def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
         steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99,
         polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000,
         act_noise=0.1, max_ep_len=1000, logger_kwargs=dict(), save_freq=1):
    """
    Deep Deterministic Policy Gradient (DDPG).

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Deterministically computes actions
                                           | from policy given states.
            ``q``        (batch,)          | Gives the current estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q_pi``     (batch,)          | Gives the composition of ``q`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to DDPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        act_noise (float): Stddev for Gaussian exploration noise added to
            policy at training time. (At test time, no noise is added.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target networks
    with tf.variable_scope('target'):
        # Note that the action placeholder going to actor_critic here is
        # irrelevant, because we only need q_targ(s, pi_targ(s)).
        pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts)

    # Bellman backup for Q function
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ)

    # DDPG losses
    pi_loss = -tf.reduce_mean(q_pi)
    q_loss = tf.reduce_mean((q - backup)**2)

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))

    # Polyak averaging for target variables
    target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
                              for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'pi': pi, 'q': q})

    def get_action(o, noise_scale):
        # Deterministic policy action plus Gaussian exploration noise,
        # clipped to the action bound.
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        # Evaluate the deterministic (noise-free) policy for n episodes.
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy (with some noise, via act_noise).
        """
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all DDPG updates at the end of the trajectory,
            in accordance with tuning done by TD3 paper authors.
            """
            for _ in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {x_ph: batch['obs1'],
                             x2_ph: batch['obs2'],
                             a_ph: batch['acts'],
                             r_ph: batch['rews'],
                             d_ph: batch['done']
                             }

                # Q-learning update
                outs = sess.run([q_loss, q, train_q_op], feed_dict)
                logger.store(LossQ=outs[0], QVals=outs[1])

                # Policy update (also performs the target-network polyak update)
                outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict)
                logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
import time

import gym
import numpy as np
import tensorflow as tf

import spinup.algos.ppo.core as core
from spinup.utils.logx import EpochLogger
from spinup.utils.mpi_tf import MpiAdamOptimizer, sync_all_params
from spinup.utils.mpi_tools import mpi_fork, mpi_avg, proc_id, mpi_statistics_scalar, num_procs
-
-
class PPOBuffer:
    """
    A buffer for storing trajectories experienced by a PPO agent interacting
    with the environment, and using Generalized Advantage Estimation (GAE-Lambda)
    for calculating the advantages of state-action pairs.
    """

    def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.95):
        # Fixed-size, pre-allocated storage; `size` is the exact number of
        # steps collected per epoch (the buffer must be filled completely
        # before get() is called).
        self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
        self.act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32)
        self.adv_buf = np.zeros(size, dtype=np.float32)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.ret_buf = np.zeros(size, dtype=np.float32)
        self.val_buf = np.zeros(size, dtype=np.float32)
        self.logp_buf = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        # ptr: next write index; path_start_idx: start of the in-progress trajectory.
        self.ptr, self.path_start_idx, self.max_size = 0, 0, size

    def store(self, obs, act, rew, val, logp):
        """
        Append one timestep of agent-environment interaction to the buffer.
        """
        assert self.ptr < self.max_size     # buffer has to have room so you can store
        self.obs_buf[self.ptr] = obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.val_buf[self.ptr] = val
        self.logp_buf[self.ptr] = logp
        self.ptr += 1

    def finish_path(self, last_val=0):
        """
        Call this at the end of a trajectory, or when one gets cut off
        by an epoch ending. This looks back in the buffer to where the
        trajectory started, and uses rewards and value estimates from
        the whole trajectory to compute advantage estimates with GAE-Lambda,
        as well as compute the rewards-to-go for each state, to use as
        the targets for the value function.

        The "last_val" argument should be 0 if the trajectory ended
        because the agent reached a terminal state (died), and otherwise
        should be V(s_T), the value function estimated for the last state.
        This allows us to bootstrap the reward-to-go calculation to account
        for timesteps beyond the arbitrary episode horizon (or epoch cutoff).
        """

        path_slice = slice(self.path_start_idx, self.ptr)
        rews = np.append(self.rew_buf[path_slice], last_val)
        vals = np.append(self.val_buf[path_slice], last_val)

        # the next two lines implement GAE-Lambda advantage calculation
        deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
        self.adv_buf[path_slice] = core.discount_cumsum(deltas, self.gamma * self.lam)

        # the next line computes rewards-to-go, to be targets for the value function
        self.ret_buf[path_slice] = core.discount_cumsum(rews, self.gamma)[:-1]

        self.path_start_idx = self.ptr

    def get(self):
        """
        Call this at the end of an epoch to get all of the data from
        the buffer, with advantages appropriately normalized (shifted to have
        mean zero and std one). Also, resets some pointers in the buffer.
        """
        assert self.ptr == self.max_size    # buffer has to be full before you can get
        self.ptr, self.path_start_idx = 0, 0
        # the next two lines implement the advantage normalization trick
        # (statistics are computed across MPI processes so every worker
        # normalizes with the same mean/std)
        adv_mean, adv_std = mpi_statistics_scalar(self.adv_buf)
        self.adv_buf = (self.adv_buf - adv_mean) / adv_std
        return [self.obs_buf, self.act_buf, self.adv_buf,
                self.ret_buf, self.logp_buf]
-
-
-"""
-
-Proximal Policy Optimization (by clipping),
-
-with early stopping based on approximate KL
-
-"""
-
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4,
        vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000,
        target_kl=0.01, logger_kwargs=dict(), save_freq=10):
    """
    Proximal Policy Optimization (by clipping), with early stopping based
    on approximate KL.

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while
            still profiting (improving the objective function)? The new policy
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Offset the seed per MPI process so workers explore differently.
    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer (each process collects its share of the epoch)
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)          # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(logp_old_ph - logp)      # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(-logp)                  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers (MPI-aware Adam averages gradients across processes)
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        # Run one PPO update: several policy gradient steps with KL-based
        # early stopping, then value-function regression steps.
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log('Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old, LossV=v_l_old,
                     KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)})

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99,
        polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000,
        max_ep_len=1000, logger_kwargs=dict(), save_freq=1):
    """
    Soft Actor-Critic (SAC).

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``mu``       (batch, act_dim)  | Computes mean actions from policy
                                           | given states.
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. Critical: must be differentiable
                                           | with respect to policy parameters all
                                           | the way through action sampling.
            ``q1``       (batch,)          | Gives one estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q1(x, pi(x)).
            ``q2_pi``    (batch,)          | Gives the composition of ``q2`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q2(x, pi(x)).
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``.
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        alpha (float): Entropy regularization coefficient. (Equivalent to
            inverse of reward scale in the original SAC paper.)

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target value network
    with tf.variable_scope('target'):
        _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in
                       ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main'])
    print(('\nNumber of parameters: \t pi: %d, \t' +
           'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n') % var_counts)

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi, q2_pi)

    # Targets for Q and V regression
    q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ)
    v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi)

    # Soft actor-critic losses
    pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
    v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2)
    value_loss = q1_loss + q2_loss + v_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q') + get_vars('main/v')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
                                  for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    # All ops to call during one training step
    step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi,
                train_pi_op, train_value_op, target_update]

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph},
                          outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2, 'v': v})

    def get_action(o, deterministic=False):
        # Mean action (mu) for evaluation, sampled action (pi) for training.
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0]

    def test_agent(n=10):
        # Evaluate the deterministic policy for n episodes. (A dead
        # `global` declaration in the original was removed: the names it
        # listed were never assigned or read directly in this function.)
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy.
        """
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {x_ph: batch['obs1'],
                             x2_ph: batch['obs2'],
                             a_ph: batch['acts'],
                             r_ph: batch['rews'],
                             d_ph: batch['done'],
                             }
                outs = sess.run(step_ops, feed_dict)
                logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2],
                             LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5],
                             VVals=outs[6], LogPi=outs[7])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
[docs]deftd3(env_fn,actor_critic=core.mlp_actor_critic,ac_kwargs=dict(),seed=0,
- steps_per_epoch=5000,epochs=100,replay_size=int(1e6),gamma=0.99,
- polyak=0.995,pi_lr=1e-3,q_lr=1e-3,batch_size=100,start_steps=10000,
- act_noise=0.1,target_noise=0.2,noise_clip=0.5,policy_delay=2,
- max_ep_len=1000,logger_kwargs=dict(),save_freq=1):
- """
-
- Args:
- env_fn : A function which creates a copy of the environment.
- The environment must satisfy the OpenAI Gym API.
-
- actor_critic: A function which takes in placeholder symbols
- for state, ``x_ph``, and action, ``a_ph``, and returns the main
- outputs from the agent's Tensorflow computation graph:
-
- =========== ================ ======================================
- Symbol Shape Description
- =========== ================ ======================================
- ``pi`` (batch, act_dim) | Deterministically computes actions
- | from policy given states.
- ``q1`` (batch,) | Gives one estimate of Q* for
- | states in ``x_ph`` and actions in
- | ``a_ph``.
- ``q2`` (batch,) | Gives another estimate of Q* for
- | states in ``x_ph`` and actions in
- | ``a_ph``.
- ``q1_pi`` (batch,) | Gives the composition of ``q1`` and
- | ``pi`` for states in ``x_ph``:
- | q1(x, pi(x)).
- =========== ================ ======================================
-
- ac_kwargs (dict): Any kwargs appropriate for the actor_critic
- function you provided to TD3.
-
- seed (int): Seed for random number generators.
-
- steps_per_epoch (int): Number of steps of interaction (state-action pairs)
- for the agent and the environment in each epoch.
-
- epochs (int): Number of epochs to run and train agent.
-
- replay_size (int): Maximum length of replay buffer.
-
- gamma (float): Discount factor. (Always between 0 and 1.)
-
- polyak (float): Interpolation factor in polyak averaging for target
- networks. Target networks are updated towards main networks
- according to:
-
- .. math:: \\theta_{\\text{targ}} \\leftarrow
- \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta
-
- where :math:`\\rho` is polyak. (Always between 0 and 1, usually
- close to 1.)
-
- pi_lr (float): Learning rate for policy.
-
- q_lr (float): Learning rate for Q-networks.
-
- batch_size (int): Minibatch size for SGD.
-
- start_steps (int): Number of steps for uniform-random action selection,
- before running real policy. Helps exploration.
-
- act_noise (float): Stddev for Gaussian exploration noise added to
- policy at training time. (At test time, no noise is added.)
-
- target_noise (float): Stddev for smoothing noise added to target
- policy.
-
- noise_clip (float): Limit for absolute value of target policy
- smoothing noise.
-
- policy_delay (int): Policy will only be updated once every
- policy_delay times for each update of the Q-networks.
-
- max_ep_len (int): Maximum length of trajectory / episode / rollout.
-
- logger_kwargs (dict): Keyword args for EpochLogger.
-
- save_freq (int): How often (in terms of gap between epochs) to save
- the current policy and value function.
-
- """
-
- logger=EpochLogger(**logger_kwargs)
- logger.save_config(locals())
-
- tf.set_random_seed(seed)
- np.random.seed(seed)
-
- env,test_env=env_fn(),env_fn()
- obs_dim=env.observation_space.shape[0]
- act_dim=env.action_space.shape[0]
-
- # Action limit for clamping: critically, assumes all dimensions share the same bound!
- act_limit=env.action_space.high[0]
-
- # Share information about action space with policy architecture
- ac_kwargs['action_space']=env.action_space
-
- # Inputs to computation graph
- x_ph,a_ph,x2_ph,r_ph,d_ph=core.placeholders(obs_dim,act_dim,obs_dim,None,None)
-
- # Main outputs from computation graph
- withtf.variable_scope('main'):
- pi,q1,q2,q1_pi=actor_critic(x_ph,a_ph,**ac_kwargs)
-
- # Target policy network
- withtf.variable_scope('target'):
- pi_targ,_,_,_=actor_critic(x2_ph,a_ph,**ac_kwargs)
-
- # Target Q networks
- withtf.variable_scope('target',reuse=True):
-
- # Target policy smoothing, by adding clipped noise to target actions
- epsilon=tf.random_normal(tf.shape(pi_targ),stddev=target_noise)
- epsilon=tf.clip_by_value(epsilon,-noise_clip,noise_clip)
- a2=pi_targ+epsilon
- a2=tf.clip_by_value(a2,-act_limit,act_limit)
-
- # Target Q-values, using action from target policy
- _,q1_targ,q2_targ,_=actor_critic(x2_ph,a2,**ac_kwargs)
-
- # Experience buffer
- replay_buffer=ReplayBuffer(obs_dim=obs_dim,act_dim=act_dim,size=replay_size)
-
- # Count variables
- var_counts=tuple(core.count_vars(scope)forscopein['main/pi','main/q1','main/q2','main'])
- print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n'%var_counts)
-
- # Bellman backup for Q functions, using Clipped Double-Q targets
- min_q_targ=tf.minimum(q1_targ,q2_targ)
- backup=tf.stop_gradient(r_ph+gamma*(1-d_ph)*min_q_targ)
-
- # TD3 losses
- pi_loss=-tf.reduce_mean(q1_pi)
- q1_loss=tf.reduce_mean((q1-backup)**2)
- q2_loss=tf.reduce_mean((q2-backup)**2)
- q_loss=q1_loss+q2_loss
-
- # Separate train ops for pi, q
- pi_optimizer=tf.train.AdamOptimizer(learning_rate=pi_lr)
- q_optimizer=tf.train.AdamOptimizer(learning_rate=q_lr)
- train_pi_op=pi_optimizer.minimize(pi_loss,var_list=get_vars('main/pi'))
- train_q_op=q_optimizer.minimize(q_loss,var_list=get_vars('main/q'))
-
- # Polyak averaging for target variables
- target_update=tf.group([tf.assign(v_targ,polyak*v_targ+(1-polyak)*v_main)
- forv_main,v_targinzip(get_vars('main'),get_vars('target'))])
-
- # Initializing targets to match main variables
- target_init=tf.group([tf.assign(v_targ,v_main)
- forv_main,v_targinzip(get_vars('main'),get_vars('target'))])
-
- sess=tf.Session()
- sess.run(tf.global_variables_initializer())
- sess.run(target_init)
-
- # Setup model saving
- logger.setup_tf_saver(sess,inputs={'x':x_ph,'a':a_ph},outputs={'pi':pi,'q1':q1,'q2':q2})
-
- defget_action(o,noise_scale):
- a=sess.run(pi,feed_dict={x_ph:o.reshape(1,-1)})[0]
- a+=noise_scale*np.random.randn(act_dim)
- returnnp.clip(a,-act_limit,act_limit)
-
- deftest_agent(n=10):
- forjinrange(n):
- o,r,d,ep_ret,ep_len=test_env.reset(),0,False,0,0
- whilenot(dor(ep_len==max_ep_len)):
- # Take deterministic actions at test time (noise_scale=0)
- o,r,d,_=test_env.step(get_action(o,0))
- ep_ret+=r
- ep_len+=1
- logger.store(TestEpRet=ep_ret,TestEpLen=ep_len)
-
- start_time=time.time()
- o,r,d,ep_ret,ep_len=env.reset(),0,False,0,0
- total_steps=steps_per_epoch*epochs
-
- # Main loop: collect experience in env and update/log each epoch
- fortinrange(total_steps):
-
- """
- Until start_steps have elapsed, randomly sample actions
- from a uniform distribution for better exploration. Afterwards,
- use the learned policy (with some noise, via act_noise).
- """
- ift>start_steps:
- a=get_action(o,act_noise)
- else:
- a=env.action_space.sample()
-
- # Step the env
- o2,r,d,_=env.step(a)
- ep_ret+=r
- ep_len+=1
-
- # Ignore the "done" signal if it comes from hitting the time
- # horizon (that is, when it's an artificial terminal signal
- # that isn't based on the agent's state)
- d=Falseifep_len==max_ep_lenelsed
-
- # Store experience to replay buffer
- replay_buffer.store(o,a,r,o2,d)
-
- # Super critical, easy to overlook step: make sure to update
- # most recent observation!
- o=o2
-
- ifdor(ep_len==max_ep_len):
- """
- Perform all TD3 updates at the end of the trajectory
- (in accordance with source code of TD3 published by
- original authors).
- """
- forjinrange(ep_len):
- batch=replay_buffer.sample_batch(batch_size)
- feed_dict={x_ph:batch['obs1'],
- x2_ph:batch['obs2'],
- a_ph:batch['acts'],
- r_ph:batch['rews'],
- d_ph:batch['done']
- }
- q_step_ops=[q_loss,q1,q2,train_q_op]
- outs=sess.run(q_step_ops,feed_dict)
- logger.store(LossQ=outs[0],Q1Vals=outs[1],Q2Vals=outs[2])
-
- ifj%policy_delay==0:
- # Delayed policy update
- outs=sess.run([pi_loss,train_pi_op,target_update],feed_dict)
- logger.store(LossPi=outs[0])
-
- logger.store(EpRet=ep_ret,EpLen=ep_len)
- o,r,d,ep_ret,ep_len=env.reset(),0,False,0,0
-
- # End of epoch wrap-up
- ift>0andt%steps_per_epoch==0:
- epoch=t//steps_per_epoch
-
- # Save model
- if(epoch%save_freq==0)or(epoch==epochs-1):
- logger.save_state({'env':env},None)
-
- # Test the performance of the deterministic version of the agent.
- test_agent()
-
- # Log info about epoch
- logger.log_tabular('Epoch',epoch)
- logger.log_tabular('EpRet',with_min_and_max=True)
- logger.log_tabular('TestEpRet',with_min_and_max=True)
- logger.log_tabular('EpLen',average_only=True)
- logger.log_tabular('TestEpLen',average_only=True)
- logger.log_tabular('TotalEnvInteracts',t)
- logger.log_tabular('Q1Vals',with_min_and_max=True)
- logger.log_tabular('Q2Vals',with_min_and_max=True)
- logger.log_tabular('LossPi',average_only=True)
- logger.log_tabular('LossQ',average_only=True)
- logger.log_tabular('Time',time.time()-start_time)
- logger.dump_tabular()
-importnumpyasnp
-importtensorflowastf
-importgym
-importtime
-importspinup.algos.trpo.coreascore
-fromspinup.utils.logximportEpochLogger
-fromspinup.utils.mpi_tfimportMpiAdamOptimizer,sync_all_params
-fromspinup.utils.mpi_toolsimportmpi_fork,mpi_avg,proc_id,mpi_statistics_scalar,num_procs
-
-
# Small positive constant to guard divisions against zero (used in the
# conjugate-gradient solver and step-size computation in trpo below).
EPS = 1e-8
-
class GAEBuffer:
    """
    A buffer for storing trajectories experienced by a TRPO agent interacting
    with the environment, and using Generalized Advantage Estimation (GAE-Lambda)
    for calculating the advantages of state-action pairs.
    """

    def __init__(self, obs_dim, act_dim, size, info_shapes, gamma=0.99, lam=0.95):
        self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
        self.act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32)
        # Per-timestep scalars: advantage, reward, return-to-go, value
        # estimate, and action log-probability.
        self.adv_buf, self.rew_buf, self.ret_buf, self.val_buf, self.logp_buf = (
            np.zeros(size, dtype=np.float32) for _ in range(5))
        # Extra per-timestep distribution info (e.g. sufficient statistics),
        # one float32 array per key, shapes given by info_shapes.
        self.info_bufs = {key: np.zeros([size] + list(shape), dtype=np.float32)
                          for key, shape in info_shapes.items()}
        self.sorted_info_keys = core.keys_as_sorted_list(self.info_bufs)
        self.gamma, self.lam = gamma, lam
        self.ptr, self.path_start_idx, self.max_size = 0, 0, size

    def store(self, obs, act, rew, val, logp, info):
        """
        Append one timestep of agent-environment interaction to the buffer.
        """
        assert self.ptr < self.max_size  # buffer has to have room so you can store
        idx = self.ptr
        self.obs_buf[idx] = obs
        self.act_buf[idx] = act
        self.rew_buf[idx] = rew
        self.val_buf[idx] = val
        self.logp_buf[idx] = logp
        # `info` is positional, aligned with the sorted key order.
        for pos, key in enumerate(self.sorted_info_keys):
            self.info_bufs[key][idx] = info[pos]
        self.ptr = idx + 1

    def finish_path(self, last_val=0):
        """
        Call this at the end of a trajectory, or when one gets cut off
        by an epoch ending. This looks back in the buffer to where the
        trajectory started, and uses rewards and value estimates from
        the whole trajectory to compute advantage estimates with GAE-Lambda,
        as well as compute the rewards-to-go for each state, to use as
        the targets for the value function.

        The "last_val" argument should be 0 if the trajectory ended
        because the agent reached a terminal state (died), and otherwise
        should be V(s_T), the value function estimated for the last state.
        This allows us to bootstrap the reward-to-go calculation to account
        for timesteps beyond the arbitrary episode horizon (or epoch cutoff).
        """
        path_slice = slice(self.path_start_idx, self.ptr)
        rews = np.append(self.rew_buf[path_slice], last_val)
        vals = np.append(self.val_buf[path_slice], last_val)

        # GAE-Lambda: one-step TD residuals, then a discounted
        # (gamma * lam) cumulative sum over the path.
        deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
        self.adv_buf[path_slice] = core.discount_cumsum(deltas, self.gamma * self.lam)

        # Rewards-to-go, used as targets for the value function.
        self.ret_buf[path_slice] = core.discount_cumsum(rews, self.gamma)[:-1]

        self.path_start_idx = self.ptr

    def get(self):
        """
        Call this at the end of an epoch to get all of the data from
        the buffer, with advantages appropriately normalized (shifted to have
        mean zero and std one). Also, resets some pointers in the buffer.
        """
        assert self.ptr == self.max_size  # buffer has to be full before you can get
        self.ptr, self.path_start_idx = 0, 0
        # Advantage normalization trick; statistics are aggregated across
        # MPI processes.
        adv_mean, adv_std = mpi_statistics_scalar(self.adv_buf)
        self.adv_buf = (self.adv_buf - adv_mean) / adv_std
        data = [self.obs_buf, self.act_buf, self.adv_buf, self.ret_buf, self.logp_buf]
        return data + core.values_as_sorted_list(self.info_bufs)
-
-"""
-
-Trust Region Policy Optimization
-
-(with support for Natural Policy Gradient)
-
-"""
-
def trpo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
         steps_per_epoch=4000, epochs=50, gamma=0.99, delta=0.01, vf_lr=1e-3,
         train_v_iters=80, damping_coeff=0.1, cg_iters=10, backtrack_iters=10,
         backtrack_coeff=0.8, lam=0.97, max_ep_len=1000, logger_kwargs=dict(),
         save_freq=10, algo='trpo'):
    """
    Trust Region Policy Optimization (with support for Natural Policy Gradient).

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ============  ================  ========================================
            Symbol        Shape             Description
            ============  ================  ========================================
            ``pi``        (batch, act_dim)  | Samples actions from policy given
                                            | states.
            ``logp``      (batch,)          | Gives log probability, according to
                                            | the policy, of taking actions ``a_ph``
                                            | in states ``x_ph``.
            ``logp_pi``   (batch,)          | Gives log probability, according to
                                            | the policy, of the action sampled by
                                            | ``pi``.
            ``info``      N/A               | A dict of any intermediate quantities
                                            | (from calculating the policy or log
                                            | probabilities) which are needed for
                                            | analytically computing KL divergence.
                                            | (eg sufficient statistics of the
                                            | distributions)
            ``info_phs``  N/A               | A dict of placeholders for old values
                                            | of the entries in ``info``.
            ``d_kl``      ()                | A symbol for computing the mean KL
                                            | divergence between the current policy
                                            | (``pi``) and the old policy (as
                                            | specified by the inputs to
                                            | ``info_phs``) over the batch of
                                            | states given in ``x_ph``.
            ``v``         (batch,)          | Gives the value estimate for states
                                            | in ``x_ph``. (Critical: make sure
                                            | to flatten this!)
            ============  ================  ========================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to TRPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        delta (float): KL-divergence limit for TRPO / NPG update.
            (Should be small for stability. Values like 0.01, 0.05.)

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        damping_coeff (float): Artifact for numerical stability, should be
            smallish. Adjusts Hessian-vector product calculation:

            .. math:: Hv \\rightarrow (\\alpha I + H)v

            where :math:`\\alpha` is the damping coefficient.
            Probably don't play with this hyperparameter.

        cg_iters (int): Number of iterations of conjugate gradient to perform.
            Increasing this will lead to a more accurate approximation
            to :math:`H^{-1} g`, and possibly slightly-improved performance,
            but at the cost of slowing things down.

            Also probably don't play with this hyperparameter.

        backtrack_iters (int): Maximum number of steps allowed in the
            backtracking line search. Since the line search usually doesn't
            backtrack, and usually only steps back once when it does, this
            hyperparameter doesn't often matter.

        backtrack_coeff (float): How far back to step during backtracking line
            search. (Always between 0 and 1, usually above 0.5.)

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

        algo: Either 'trpo' or 'npg': this code supports both, since they are
            almost the same.
    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Give each MPI process its own seed so rollouts decorrelate.
    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph, plus placeholders for old pdist (for KL)
    pi, logp, logp_pi, info, info_phs, d_kl, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] + core.values_as_sorted_list(info_phs)

    # Every step, get: action, value, logprob, & info for pdist (for computing kl div)
    get_action_ops = [pi, v, logp_pi] + core.values_as_sorted_list(info)

    # Experience buffer: each process collects its share of steps_per_epoch.
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    info_shapes = {k: v.shape.as_list()[1:] for k, v in info_phs.items()}
    buf = GAEBuffer(obs_dim, act_dim, local_steps_per_epoch, info_shapes, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # TRPO losses
    ratio = tf.exp(logp - logp_old_ph)          # pi(a|s) / pi_old(a|s)
    pi_loss = -tf.reduce_mean(ratio * adv_ph)
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Optimizer for value function
    train_vf = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    # Symbols needed for CG solver
    pi_params = core.get_vars('pi')
    gradient = core.flat_grad(pi_loss, pi_params)
    v_ph, hvp = core.hessian_vector_product(d_kl, pi_params)
    if damping_coeff > 0:
        # Damped Hessian-vector product: (alpha*I + H)v, for numerical stability.
        hvp += damping_coeff * v_ph

    # Symbols for getting and setting params
    get_pi_params = core.flat_concat(pi_params)
    set_pi_params = core.assign_params_from_flat(v_ph, pi_params)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def cg(Ax, b):
        """
        Conjugate gradient algorithm
        (see https://en.wikipedia.org/wiki/Conjugate_gradient_method)
        """
        x = np.zeros_like(b)
        r = b.copy()  # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start.
        p = r.copy()
        r_dot_old = np.dot(r, r)
        for _ in range(cg_iters):
            z = Ax(p)
            alpha = r_dot_old / (np.dot(p, z) + EPS)
            x += alpha * p
            r -= alpha * z
            r_dot_new = np.dot(r, r)
            p = r + (r_dot_new / r_dot_old) * p
            r_dot_old = r_dot_new
        return x

    def update():
        # Prepare hessian func, gradient eval
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        Hx = lambda x: mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x}))
        g, pi_l_old, v_l_old = sess.run([gradient, pi_loss, v_loss], feed_dict=inputs)
        g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old)

        # Core calculations for TRPO or NPG: x approximates H^-1 g, and
        # alpha scales the step to sit on the KL-constraint boundary.
        x = cg(Hx, g)
        alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS))
        old_params = sess.run(get_pi_params)

        def set_and_eval(step):
            sess.run(set_pi_params, feed_dict={v_ph: old_params - alpha * x * step})
            return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs))

        if algo == 'npg':
            # npg has no backtracking or hard kl constraint enforcement
            kl, pi_l_new = set_and_eval(step=1.)

        elif algo == 'trpo':
            # trpo augments npg with backtracking line search, hard kl
            for j in range(backtrack_iters):
                kl, pi_l_new = set_and_eval(step=backtrack_coeff**j)
                if kl <= delta and pi_l_new <= pi_l_old:
                    logger.log('Accepting new params at step %d of line search.' % j)
                    logger.store(BacktrackIters=j)
                    break

                if j == backtrack_iters - 1:
                    # Line search exhausted without satisfying the KL /
                    # improvement conditions: revert to the old params.
                    logger.log('Line search failed! Keeping old params.')
                    logger.store(BacktrackIters=j)
                    kl, pi_l_new = set_and_eval(step=0.)

        # Value function updates
        for _ in range(train_v_iters):
            sess.run(train_vf, feed_dict=inputs)
        v_l_new = sess.run(v_loss, feed_dict=inputs)

        # Log changes from update
        logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            agent_outs = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)})
            a, v_t, logp_t, info_t = agent_outs[0][0], agent_outs[1], agent_outs[2], agent_outs[3:]

            # save and log
            # NOTE(review): `r` here is the reward from the *previous*
            # transition (env.step runs below), so rewards enter the buffer
            # one step late relative to (o, a) — confirm this matches the
            # buffer's expectations before changing anything.
            buf.store(o, a, r, v_t, logp_t, info_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not(terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform TRPO or NPG update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('KL', average_only=True)
        if algo == 'trpo':
            logger.log_tabular('BacktrackIters', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
-importnumpyasnp
-importtensorflowastf
-importgym
-importtime
-importspinup.algos.vpg.coreascore
-fromspinup.utils.logximportEpochLogger
-fromspinup.utils.mpi_tfimportMpiAdamOptimizer,sync_all_params
-fromspinup.utils.mpi_toolsimportmpi_fork,mpi_avg,proc_id,mpi_statistics_scalar,num_procs
-
-
class VPGBuffer:
    """
    A buffer for storing trajectories experienced by a VPG agent interacting
    with the environment, and using Generalized Advantage Estimation (GAE-Lambda)
    for calculating the advantages of state-action pairs.
    """

    def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.95):
        self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
        self.act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32)
        # Per-timestep scalars: advantage, reward, return-to-go, value
        # estimate, and action log-probability.
        self.adv_buf, self.rew_buf, self.ret_buf, self.val_buf, self.logp_buf = (
            np.zeros(size, dtype=np.float32) for _ in range(5))
        self.gamma, self.lam = gamma, lam
        self.ptr, self.path_start_idx, self.max_size = 0, 0, size

    def store(self, obs, act, rew, val, logp):
        """
        Append one timestep of agent-environment interaction to the buffer.
        """
        assert self.ptr < self.max_size  # buffer has to have room so you can store
        idx = self.ptr
        self.obs_buf[idx] = obs
        self.act_buf[idx] = act
        self.rew_buf[idx] = rew
        self.val_buf[idx] = val
        self.logp_buf[idx] = logp
        self.ptr = idx + 1

    def finish_path(self, last_val=0):
        """
        Call this at the end of a trajectory, or when one gets cut off
        by an epoch ending. This looks back in the buffer to where the
        trajectory started, and uses rewards and value estimates from
        the whole trajectory to compute advantage estimates with GAE-Lambda,
        as well as compute the rewards-to-go for each state, to use as
        the targets for the value function.

        The "last_val" argument should be 0 if the trajectory ended
        because the agent reached a terminal state (died), and otherwise
        should be V(s_T), the value function estimated for the last state.
        This allows us to bootstrap the reward-to-go calculation to account
        for timesteps beyond the arbitrary episode horizon (or epoch cutoff).
        """
        path_slice = slice(self.path_start_idx, self.ptr)
        rews = np.append(self.rew_buf[path_slice], last_val)
        vals = np.append(self.val_buf[path_slice], last_val)

        # GAE-Lambda: one-step TD residuals, then a discounted
        # (gamma * lam) cumulative sum over the path.
        deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
        self.adv_buf[path_slice] = core.discount_cumsum(deltas, self.gamma * self.lam)

        # Rewards-to-go, used as targets for the value function.
        self.ret_buf[path_slice] = core.discount_cumsum(rews, self.gamma)[:-1]

        self.path_start_idx = self.ptr

    def get(self):
        """
        Call this at the end of an epoch to get all of the data from
        the buffer, with advantages appropriately normalized (shifted to have
        mean zero and std one). Also, resets some pointers in the buffer.
        """
        assert self.ptr == self.max_size  # buffer has to be full before you can get
        self.ptr, self.path_start_idx = 0, 0
        # Advantage normalization trick; statistics are aggregated across
        # MPI processes.
        adv_mean, adv_std = mpi_statistics_scalar(self.adv_buf)
        self.adv_buf = (self.adv_buf - adv_mean) / adv_std
        return [self.obs_buf, self.act_buf, self.adv_buf,
                self.ret_buf, self.logp_buf]
-
-
-"""
-
-Vanilla Policy Gradient
-
-(with GAE-Lambda for advantage estimation)
-
-"""
-
def vpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4,
        vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000,
        logger_kwargs=dict(), save_freq=10):
    """
    Vanilla Policy Gradient (with GAE-Lambda for advantage estimation).

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to VPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Give each MPI process its own seed so rollouts decorrelate.
    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer: each process collects its share of steps_per_epoch.
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # VPG objectives
    pi_loss = -tf.reduce_mean(logp * adv_ph)
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(logp_old_ph - logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(-logp)              # a sample estimate for entropy, also easy to compute

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)

        # Policy gradient step (a single step, unlike the value function below)
        sess.run(train_pi, feed_dict=inputs)

        # Value function learning
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl = sess.run([pi_loss, v_loss, approx_kl], feed_dict=inputs)
        logger.store(LossPi=pi_l_old, LossV=v_l_old,
                     KL=kl, Entropy=ent,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)})

            # save and log
            # NOTE(review): `r` here is the reward from the *previous*
            # transition (env.step runs below), so rewards enter the buffer
            # one step late relative to (o, a) — confirm this matches the
            # buffer's expectations before changing anything.
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not(terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform VPG update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
-"""
-
-Some simple logging functionality, inspired by rllab's logging.
-
-Logs to a tab-separated-values file (path/to/output_directory/progress.txt)
-
-"""
-importjson
-importjoblib
-importshutil
-importnumpyasnp
-importtensorflowastf
-importos.pathasosp,time,atexit,os
-fromspinup.utils.mpi_toolsimportproc_id,mpi_statistics_scalar
-fromspinup.utils.serialization_utilsimportconvert_json
-
# ANSI foreground color codes, keyed by human-readable name.
color2num = dict(
    gray=30,
    red=31,
    green=32,
    yellow=33,
    blue=34,
    magenta=35,
    cyan=36,
    white=37,
    crimson=38
)


def colorize(string, color, bold=False, highlight=False):
    """
    Colorize a string with ANSI escape codes.

    This function was originally written by John Schulman.
    """
    code = color2num[color]
    if highlight:
        # Shift foreground code into the background-color range.
        code += 10
    attrs = [str(code)]
    if bold:
        attrs.append('1')
    return '\x1b[%sm%s\x1b[0m' % (';'.join(attrs), string)
-
-
def restore_tf_graph(sess, fpath):
    """
    Loads graphs saved by Logger.

    Will output a dictionary whose keys and values are from the 'inputs'
    and 'outputs' dict you specified with logger.setup_tf_saver().

    Args:
        sess: A Tensorflow session.
        fpath: Filepath to save directory.

    Returns:
        A dictionary mapping from keys to tensors in the computation graph
        loaded from ``fpath``.
    """
    # Restore the SavedModel into the session, then map the saved tensor
    # names (recorded in model_info.pkl) back to live graph tensors.
    tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING], fpath)
    model_info = joblib.load(osp.join(fpath, 'model_info.pkl'))
    graph = tf.get_default_graph()
    model = dict()
    for section in ('inputs', 'outputs'):
        for key, tensor_name in model_info[section].items():
            model[key] = graph.get_tensor_by_name(tensor_name)
    return model
-
-
[docs]classLogger:
- """
- A general-purpose logger.
-
- Makes it easy to save diagnostics, hyperparameter configurations, the
- state of a training run, and the trained model.
- """
-
-
    def __init__(self, output_dir=None, output_fname='progress.txt', exp_name=None):
        """
        Initialize a Logger.

        Args:
            output_dir (string): A directory for saving results to. If
                ``None``, defaults to a temp directory of the form
                ``/tmp/experiments/somerandomnumber``.

            output_fname (string): Name for the tab-separated-value file
                containing metrics logged throughout a training run.
                Defaults to ``progress.txt``.

            exp_name (string): Experiment name. If you run multiple training
                runs and give them all the same ``exp_name``, the plotter
                will know to group them. (Use case: if you run the same
                hyperparameter configuration with multiple random seeds, you
                should give them all the same ``exp_name``.)
        """
        if proc_id() == 0:
            # Only the rank-0 MPI process touches the filesystem; all other
            # processes keep output_dir / output_file set to None.
            self.output_dir = output_dir or "/tmp/experiments/%i" % int(time.time())
            if osp.exists(self.output_dir):
                print("Warning: Log dir %s already exists! Storing info there anyway." % self.output_dir)
            else:
                os.makedirs(self.output_dir)
            self.output_file = open(osp.join(self.output_dir, output_fname), 'w')
            # Close the file on interpreter exit so buffered rows are flushed.
            atexit.register(self.output_file.close)
            print(colorize("Logging data to %s" % self.output_file.name, 'green', bold=True))
        else:
            self.output_dir = None
            self.output_file = None
        # first_row: whether the next dump still needs to establish headers.
        self.first_row = True
        # log_headers: ordered diagnostic names, frozen after the first row.
        self.log_headers = []
        # log_current_row: values accumulated since the last dump_tabular().
        self.log_current_row = {}
        self.exp_name = exp_name
-
-
[docs]deflog(self,msg,color='green'):
- """Print a colorized message to stdout."""
- ifproc_id()==0:
- print(colorize(msg,color,bold=True))
-
-
[docs]deflog_tabular(self,key,val):
- """
- Log a value of some diagnostic.
-
- Call this only once for each diagnostic quantity, each iteration.
- After using ``log_tabular`` to store values for each diagnostic,
- make sure to call ``dump_tabular`` to write them out to file and
- stdout (otherwise they will not get saved anywhere).
- """
- ifself.first_row:
- self.log_headers.append(key)
- else:
- assertkeyinself.log_headers,"Trying to introduce a new key %s that you didn't include in the first iteration"%key
- assertkeynotinself.log_current_row,"You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key
- self.log_current_row[key]=val
-
-
def save_config(self, config):
    """
    Log an experiment configuration.

    Call this once at the top of your experiment, passing in all important
    config vars as a dict. This will serialize the config to JSON, while
    handling anything which can't be serialized in a graceful way (writing
    as informative a string as possible).

    Example use:

    .. code-block:: python

        logger = EpochLogger(**logger_kwargs)
        logger.save_config(locals())
    """
    # convert_json replaces unserializable values with informative strings.
    config_json = convert_json(config)
    if self.exp_name is not None:
        config_json['exp_name'] = self.exp_name
    # Only the rank-0 MPI process prints and writes config.json.
    if proc_id() == 0:
        output = json.dumps(config_json, separators=(',',':\t'), indent=4, sort_keys=True)
        print(colorize('Saving config:\n', color='cyan', bold=True))
        print(output)
        with open(osp.join(self.output_dir, "config.json"), 'w') as out:
            out.write(output)
-
-
def save_state(self, state_dict, itr=None):
    """
    Saves the state of an experiment.

    To be clear: this is about saving *state*, not logging diagnostics.
    All diagnostic logging is separate from this function. This function
    will save whatever is in ``state_dict``---usually just a copy of the
    environment---and the most recent parameters for the model you
    previously set up saving for with ``setup_tf_saver``.

    Call with any frequency you prefer. If you only want to maintain a
    single state and overwrite it at each call with the most recent
    version, leave ``itr=None``. If you want to keep all of the states you
    save, provide unique (increasing) values for 'itr'.

    Args:
        state_dict (dict): Dictionary containing essential elements to
            describe the current state of training.

        itr: An int, or None. Current iteration of training.
    """
    # Only the rank-0 MPI process writes state to disk.
    if proc_id() == 0:
        fname = 'vars.pkl' if itr is None else 'vars%d.pkl'%itr
        try:
            joblib.dump(state_dict, osp.join(self.output_dir, fname))
        except Exception:
            # Fix: was a bare `except:`, which also swallowed
            # SystemExit/KeyboardInterrupt. Saving state is best-effort,
            # so we still only warn on ordinary failures.
            self.log('Warning: could not pickle state_dict.', color='red')
        # Also snapshot model parameters if TF saving was configured.
        if hasattr(self, 'tf_saver_elements'):
            self._tf_simple_save(itr)
-
-
def setup_tf_saver(self, sess, inputs, outputs):
    """
    Set up easy model saving for tensorflow.

    Call once, after defining your computation graph but before training.

    Args:
        sess: The Tensorflow session in which you train your computation
            graph.

        inputs (dict): A dictionary that maps from keys of your choice
            to the tensorflow placeholders that serve as inputs to the
            computation graph. Make sure that *all* of the placeholders
            needed for your outputs are included!

        outputs (dict): A dictionary that maps from keys of your choice
            to the outputs from your computation graph.
    """
    def tensor_names(tensor_dict):
        # Record tensor names so saved tensors can be matched back up
        # to variables after a restore.
        return {key: tensor.name for key, tensor in tensor_dict.items()}

    self.tf_saver_elements = {'session': sess, 'inputs': inputs, 'outputs': outputs}
    self.tf_saver_info = {'inputs': tensor_names(inputs),
                          'outputs': tensor_names(outputs)}
-
def _tf_simple_save(self, itr=None):
    """
    Uses simple_save to save a trained model, plus info to make it easy
    to associate tensors to variables after restore.
    """
    # Only the rank-0 MPI process saves the model.
    if proc_id() == 0:
        assert hasattr(self, 'tf_saver_elements'), \
            "First have to setup saving with self.setup_tf_saver"
        # Directory name carries the iteration number when given, e.g.
        # 'simple_save10'; otherwise a single 'simple_save' is overwritten.
        fpath = 'simple_save' + ('%d'%itr if itr is not None else '')
        fpath = osp.join(self.output_dir, fpath)
        if osp.exists(fpath):
            # simple_save refuses to be useful if fpath already exists,
            # so just delete fpath if it's there.
            shutil.rmtree(fpath)
        tf.saved_model.simple_save(export_dir=fpath, **self.tf_saver_elements)
        # Persist the tensor-name map recorded by setup_tf_saver.
        joblib.dump(self.tf_saver_info, osp.join(fpath, 'model_info.pkl'))
-
-
def dump_tabular(self):
    """
    Write all of the diagnostics from the current iteration.

    Writes both to stdout, and to the output file.
    """
    # Only the rank-0 MPI process prints/writes.
    if proc_id() == 0:
        vals = []
        # Column width adapts to the longest header, with a floor of 15.
        key_lens = [len(key) for key in self.log_headers]
        max_key_len = max(15, max(key_lens))
        keystr = '%' + '%d'%max_key_len
        fmt = "| " + keystr + "s | %15s |"
        n_slashes = 22 + max_key_len
        print("-"*n_slashes)
        for key in self.log_headers:
            # Missing diagnostics render as empty cells.
            val = self.log_current_row.get(key, "")
            # Compact float formatting for numeric values.
            valstr = "%8.3g"%val if hasattr(val, "__float__") else val
            print(fmt%(key, valstr))
            vals.append(val)
        print("-"*n_slashes)
        if self.output_file is not None:
            # The header line is written only once, on the first dump.
            if self.first_row:
                self.output_file.write("\t".join(self.log_headers)+"\n")
            self.output_file.write("\t".join(map(str, vals))+"\n")
            self.output_file.flush()
    # Reset per-iteration state on every process.
    self.log_current_row.clear()
    self.first_row = False
-
-
class EpochLogger(Logger):
    """
    A variant of Logger tailored for tracking average values over epochs.

    Typical use case: there is some quantity which is calculated many times
    throughout an epoch, and at the end of the epoch, you would like to
    report the average / std / min / max value of that quantity.

    With an EpochLogger, each time the quantity is calculated, you would
    use

    .. code-block:: python

        epoch_logger.store(NameOfQuantity=quantity_value)

    to load it into the EpochLogger's state. Then at the end of the epoch, you
    would use

    .. code-block:: python

        epoch_logger.log_tabular(NameOfQuantity, **options)

    to record the desired values.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Maps diagnostic name -> list of values stored during the epoch.
        self.epoch_dict = dict()

    def store(self, **kwargs):
        """
        Save something into the epoch_logger's current state.

        Provide an arbitrary number of keyword arguments with numerical
        values.
        """
        for k, v in kwargs.items():
            if not(k in self.epoch_dict.keys()):
                self.epoch_dict[k] = []
            self.epoch_dict[k].append(v)

    def log_tabular(self, key, val=None, with_min_and_max=False, average_only=False):
        """
        Log a value or possibly the mean/std/min/max values of a diagnostic.

        Args:
            key (string): The name of the diagnostic. If you are logging a
                diagnostic whose state has previously been saved with
                ``store``, the key here has to match the key you used there.

            val: A value for the diagnostic. If you have previously saved
                values for this key via ``store``, do *not* provide a ``val``
                here.

            with_min_and_max (bool): If true, log min and max values of the
                diagnostic over the epoch.

            average_only (bool): If true, do not log the standard deviation
                of the diagnostic over the epoch.
        """
        if val is not None:
            # Direct value provided: behave like the base Logger.
            super().log_tabular(key, val)
        else:
            v = self.epoch_dict[key]
            # Flatten a list of non-scalar arrays into one array; scalar
            # entries are passed through as a plain list.
            vals = np.concatenate(v) if isinstance(v[0], np.ndarray) and len(v[0].shape) > 0 else v
            # stats is (mean, std) or (mean, std, min, max) across MPI procs.
            stats = mpi_statistics_scalar(vals, with_min_and_max=with_min_and_max)
            super().log_tabular(key if average_only else 'Average' + key, stats[0])
            if not(average_only):
                super().log_tabular('Std' + key, stats[1])
            if with_min_and_max:
                super().log_tabular('Max' + key, stats[3])
                super().log_tabular('Min' + key, stats[2])
        # Clear stored values for this key so the next epoch starts fresh.
        self.epoch_dict[key] = []

    def get_stats(self, key):
        """
        Lets an algorithm ask the logger for mean/std/min/max of a diagnostic.
        """
        v = self.epoch_dict[key]
        vals = np.concatenate(v) if isinstance(v[0], np.ndarray) and len(v[0].shape) > 0 else v
        return mpi_statistics_scalar(vals)
-importnumpyasnp
-importtensorflowastf
-frommpi4pyimportMPI
-fromspinup.utils.mpi_toolsimportbroadcast
-
-
def flat_concat(xs):
    """Flatten every tensor in xs to 1-D and concatenate into one vector."""
    flattened = [tf.reshape(t, (-1,)) for t in xs]
    return tf.concat(flattened, axis=0)
-
def assign_params_from_flat(x, params):
    # Build an op that writes the flat vector x back into the list of
    # parameter tensors, splitting by each parameter's element count.
    flat_size = lambda p : int(np.prod(p.shape.as_list())) # the 'int' is important for scalars
    splits = tf.split(x, [flat_size(p) for p in params])
    new_params = [tf.reshape(p_new, p.shape) for p, p_new in zip(params, splits)]
    # Group all assigns into a single op.
    return tf.group([tf.assign(p, p_new) for p, p_new in zip(params, new_params)])
-
def sync_params(params):
    # Broadcast the flattened parameter vector from the root MPI process
    # to all others, then write it back into the parameter tensors.
    get_params = flat_concat(params)
    def _broadcast(x):
        broadcast(x)
        return x
    # NOTE(review): the py_func output dtype is hardcoded to tf.float32,
    # which assumes all params are float32 — confirm for other dtypes.
    synced_params = tf.py_func(_broadcast, [get_params], tf.float32)
    return assign_params_from_flat(synced_params, params)
-
-
def sync_all_params():
    """Sync all tf variables across MPI processes."""
    return sync_params(tf.global_variables())
-
-
-
class MpiAdamOptimizer(tf.train.AdamOptimizer):
    """
    Adam optimizer that averages gradients across MPI processes.

    The compute_gradients method is taken from Baselines `MpiAdamOptimizer`_.
    For documentation on method arguments, see the Tensorflow docs page for
    the base `AdamOptimizer`_.

    .. _`MpiAdamOptimizer`: https://github.com/openai/baselines/blob/master/baselines/common/mpi_adam_optimizer.py
    .. _`AdamOptimizer`: https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer
    """
    # NOTE(review): the compute_gradients/apply_gradients overrides mentioned
    # in the docstring are not shown in this excerpt — confirm upstream source.

    def __init__(self, **kwargs):
        # Communicator over all MPI processes, used for gradient averaging.
        self.comm = MPI.COMM_WORLD
        tf.train.AdamOptimizer.__init__(self, **kwargs)
-
-
def mpi_fork(n, bind_to_core=False):
    """
    Re-launches the current script with workers linked by MPI.

    Also, terminates the original process that launched it.

    Taken almost without modification from the Baselines function of the
    `same name`_.

    .. _`same name`: https://github.com/openai/baselines/blob/master/baselines/common/mpi_fork.py

    Args:
        n (int): Number of process to split into.

        bind_to_core (bool): Bind each MPI process to a core.
    """
    # Nothing to do for a single process.
    if n <= 1:
        return
    # IN_MPI guards against re-forking recursively once inside mpirun.
    if os.getenv("IN_MPI") is None:
        env = os.environ.copy()
        env.update(
            # Limit MKL/OpenMP to one thread each so the n MPI processes
            # don't oversubscribe the machine's cores.
            MKL_NUM_THREADS="1",
            OMP_NUM_THREADS="1",
            IN_MPI="1"
        )
        args = ["mpirun", "-np", str(n)]
        if bind_to_core:
            args += ["-bind-to", "core"]
        # Relaunch this exact script under mpirun.
        args += [sys.executable] + sys.argv
        subprocess.check_call(args, env=env)
        # Kill the original (non-MPI) parent process.
        sys.exit()
def mpi_avg(x):
    """Average a scalar or vector over MPI processes."""
    # Global sum divided by the number of participating processes.
    return mpi_sum(x) / num_procs()
-
-
def mpi_statistics_scalar(x, with_min_and_max=False):
    """
    Get mean/std and optional min/max of scalar x across MPI processes.

    Args:
        x: An array containing samples of the scalar to produce statistics
            for.

        with_min_and_max (bool): If true, return min and max of x in
            addition to mean and std.
    """
    x = np.array(x, dtype=np.float32)
    # First reduction: global sum and global count give the global mean.
    global_sum, global_n = mpi_sum([np.sum(x), len(x)])
    mean = global_sum / global_n

    # Second reduction: sum of squared deviations from the *global* mean.
    global_sum_sq = mpi_sum(np.sum((x - mean)**2))
    std = np.sqrt(global_sum_sq / global_n) # compute global std

    if with_min_and_max:
        # +/- inf are the identity elements for MIN/MAX reductions, so a
        # process with an empty local array doesn't affect the result.
        global_min = mpi_op(np.min(x) if len(x) > 0 else np.inf, op=MPI.MIN)
        global_max = mpi_op(np.max(x) if len(x) > 0 else -np.inf, op=MPI.MAX)
        return mean, std, global_min, global_max
    return mean, std
def setup_logger_kwargs(exp_name, seed=None, data_dir=None, datestamp=False):
    """
    Sets up the output_dir for a logger and returns a dict for logger kwargs.

    If no seed is given and datestamp is false,

    ::

        output_dir = data_dir/exp_name

    If a seed is given and datestamp is false,

    ::

        output_dir = data_dir/exp_name/exp_name_s[seed]

    If datestamp is true, amend to

    ::

        output_dir = data_dir/YY-MM-DD_exp_name/YY-MM-DD_HH-MM-SS_exp_name_s[seed]

    You can force datestamp=True by setting ``FORCE_DATESTAMP=True`` in
    ``spinup/user_config.py``.

    Args:

        exp_name (string): Name for experiment.

        seed (int): Seed for random number generators used by experiment.

        data_dir (string): Path to folder where results should be saved.
            Default is the ``DEFAULT_DATA_DIR`` in ``spinup/user_config.py``.

        datestamp (bool): Whether to include a date and timestamp in the
            name of the save directory.

    Returns:

        logger_kwargs, a dict containing output_dir and exp_name.
    """

    # Datestamp forcing
    datestamp = datestamp or FORCE_DATESTAMP

    # Make base path
    ymd_time = time.strftime("%Y-%m-%d_") if datestamp else ''
    relpath = ''.join([ymd_time, exp_name])

    if seed is not None:
        # Make a seed-specific subfolder in the experiment directory.
        if datestamp:
            hms_time = time.strftime("%Y-%m-%d_%H-%M-%S")
            # NOTE(review): joins the timestamp and exp_name with '-', while
            # the docstring shows '_' — confirm which is intended.
            subfolder = ''.join([hms_time, '-', exp_name, '_s', str(seed)])
        else:
            subfolder = ''.join([exp_name, '_s', str(seed)])
        relpath = osp.join(relpath, subfolder)

    data_dir = data_dir or DEFAULT_DATA_DIR
    logger_kwargs = dict(output_dir=osp.join(data_dir, relpath),
                         exp_name=exp_name)
    return logger_kwargs
-
-
-
def call_experiment(exp_name, thunk, seed=0, num_cpu=1, data_dir=None,
                    datestamp=False, **kwargs):
    """
    Run a function (thunk) with hyperparameters (kwargs), plus configuration.

    This wraps a few pieces of functionality which are useful when you want
    to run many experiments in sequence, including logger configuration and
    splitting into multiple processes for MPI.

    There's also a SpinningUp-specific convenience added into executing the
    thunk: if ``env_name`` is one of the kwargs passed to call_experiment, it's
    assumed that the thunk accepts an argument called ``env_fn``, and that
    the ``env_fn`` should make a gym environment with the given ``env_name``.

    The way the experiment is actually executed is slightly complicated: the
    function is serialized to a string, and then ``run_entrypoint.py`` is
    executed in a subprocess call with the serialized string as an argument.
    ``run_entrypoint.py`` unserializes the function call and executes it.
    We choose to do it this way---instead of just calling the function
    directly here---to avoid leaking state between successive experiments.

    Args:

        exp_name (string): Name for experiment.

        thunk (callable): A python function.

        seed (int): Seed for random number generators.

        num_cpu (int): Number of MPI processes to split into. Also accepts
            'auto', which will set up as many procs as there are cpus on
            the machine.

        data_dir (string): Used in configuring the logger, to decide where
            to store experiment results. Note: if left as None, data_dir will
            default to ``DEFAULT_DATA_DIR`` from ``spinup/user_config.py``.

        **kwargs: All kwargs to pass to thunk.

    """

    # Determine number of CPU cores to run on
    num_cpu = psutil.cpu_count(logical=False) if num_cpu == 'auto' else num_cpu

    # Send random seed to thunk
    kwargs['seed'] = seed

    # Be friendly and print out your kwargs, so we all know what's up
    print(colorize('Running experiment:\n', color='cyan', bold=True))
    print(exp_name + '\n')
    print(colorize('with kwargs:\n', color='cyan', bold=True))
    kwargs_json = convert_json(kwargs)
    print(json.dumps(kwargs_json, separators=(',',':\t'), indent=4, sort_keys=True))
    print('\n')

    # Set up logger output directory
    if 'logger_kwargs' not in kwargs:
        kwargs['logger_kwargs'] = setup_logger_kwargs(exp_name, seed, data_dir, datestamp)
    else:
        print('Note: Call experiment is not handling logger_kwargs.\n')

    def thunk_plus():
        # Make 'env_fn' from 'env_name'
        if 'env_name' in kwargs:
            import gym
            env_name = kwargs['env_name']
            kwargs['env_fn'] = lambda : gym.make(env_name)
            del kwargs['env_name']

        # Fork into multiple processes
        mpi_fork(num_cpu)

        # Run thunk
        thunk(**kwargs)

    # Prepare to launch a script to run the experiment: serialize the
    # closure so run_entrypoint.py can reconstruct and call it.
    pickled_thunk = cloudpickle.dumps(thunk_plus)
    encoded_thunk = base64.b64encode(zlib.compress(pickled_thunk)).decode('utf-8')

    entrypoint = osp.join(osp.abspath(osp.dirname(__file__)), 'run_entrypoint.py')
    cmd = [sys.executable if sys.executable else 'python', entrypoint, encoded_thunk]
    try:
        subprocess.check_call(cmd, env=os.environ)
    except CalledProcessError:
        # The subprocess already printed the real traceback; add context
        # and re-raise so the caller sees the failure.
        err_msg = '\n'*3 + '='*DIV_LINE_WIDTH + '\n' + dedent("""

            There appears to have been an error in your experiment.

            Check the traceback above to see what actually went wrong. The
            traceback below, included for completeness (but probably not useful
            for diagnosing the error), shows the stack leading up to the
            experiment launch.

            """) + '='*DIV_LINE_WIDTH + '\n'*3
        print(err_msg)
        raise

    # Tell the user about where results are, and how to check them
    logger_kwargs = kwargs['logger_kwargs']

    plot_cmd = 'python -m spinup.run plot ' + logger_kwargs['output_dir']
    plot_cmd = colorize(plot_cmd, 'green')

    test_cmd = 'python -m spinup.run test_policy ' + logger_kwargs['output_dir']
    test_cmd = colorize(test_cmd, 'green')

    output_msg = '\n'*5 + '='*DIV_LINE_WIDTH + '\n' + dedent("""\
    End of experiment.


    Plot results from this run with:

    %s


    Watch the trained agent with:

    %s


    """%(plot_cmd, test_cmd)) + '='*DIV_LINE_WIDTH + '\n'*5

    print(output_msg)
-
-
def all_bools(vals):
    """Return True iff every element of vals is a bool (True for empty input)."""
    return all(isinstance(v, bool) for v in vals)
-
def valid_str(v):
    """
    Convert a value or values to a string which could go in a filepath.

    Named objects (functions, classes) are reduced to their ``__name__``;
    tuples and lists are converted element-wise and joined with dashes;
    everything else is lowercased with any character outside
    ``[-_a-zA-Z0-9]`` replaced by ``'-'``.

    Partly based on `this gist`_.

    .. _`this gist`: https://gist.github.com/seanh/93666

    """
    if hasattr(v, '__name__'):
        return valid_str(v.__name__)

    if isinstance(v, (tuple, list)):
        return '-'.join(valid_str(item) for item in v)

    # Valid characters are '-', '_', and alphanumeric. Replace invalid
    # chars with '-'.
    allowed = set('-_' + string.ascii_letters + string.digits)
    lowered = str(v).lower()
    return ''.join(ch if ch in allowed else '-' for ch in lowered)
-
-
-
class ExperimentGrid:
    """
    Tool for running many experiments given hyperparameter ranges.
    """
    # NOTE(review): ``variants`` calls ``self._variants``, whose definition
    # is not shown in this excerpt — confirm against the upstream source.

    def __init__(self, name=''):
        # Parallel lists: keys[i] takes values vals[i], with shorthand
        # shs[i] and name-inclusion flag in_names[i].
        self.keys = []
        self.vals = []
        self.shs = []
        self.in_names = []
        self.name(name)

    def name(self, _name):
        # Set the grid's display name (also used as variant-name prefix).
        assert isinstance(_name, str), "Name has to be a string."
        self._name = _name

    def print(self):
        """Print a helpful report about the experiment grid."""
        print('='*DIV_LINE_WIDTH)

        # Prepare announcement at top of printing. If the ExperimentGrid has a
        # short name, write this as one line. If the name is long, break the
        # announcement over two lines.
        base_msg = 'ExperimentGrid %s runs over parameters:\n'
        name_insert = '['+self._name+']'
        if len(base_msg%name_insert) <= 80:
            msg = base_msg%name_insert
        else:
            msg = base_msg%(name_insert+'\n')
        print(colorize(msg, color='green', bold=True))

        # List off parameters, shorthands, and possible values.
        for k, v, sh in zip(self.keys, self.vals, self.shs):
            color_k = colorize(k.ljust(40), color='cyan', bold=True)
            print('', color_k, '['+sh+']' if sh is not None else '', '\n')
            for i, val in enumerate(v):
                print('\t' + str(convert_json(val)))
            print()

        # Count up the number of variants. The number counting seeds
        # is the total number of experiments that will run; the number not
        # counting seeds is the total number of otherwise-unique configs
        # being investigated.
        nvars_total = int(np.prod([len(v) for v in self.vals]))
        if 'seed' in self.keys:
            num_seeds = len(self.vals[self.keys.index('seed')])
            nvars_seedless = int(nvars_total / num_seeds)
        else:
            nvars_seedless = nvars_total
        print(' Variants, counting seeds: '.ljust(40), nvars_total)
        print(' Variants, not counting seeds: '.ljust(40), nvars_seedless)
        print()
        print('='*DIV_LINE_WIDTH)

    def _default_shorthand(self, key):
        # Create a default shorthand for the key, built from the first
        # three letters of each colon-separated part.
        # But if the first three letters contains something which isn't
        # alphanumeric, shear that off.
        valid_chars = "%s%s"%(string.ascii_letters, string.digits)
        def shear(x):
            return ''.join(z for z in x[:3] if z in valid_chars)
        sh = '-'.join([shear(x) for x in key.split(':')])
        return sh

    def add(self, key, vals, shorthand=None, in_name=False):
        """
        Add a parameter (key) to the grid config, with potential values (vals).

        By default, if a shorthand isn't given, one is automatically generated
        from the key using the first three letters of each colon-separated
        term. To disable this behavior, change ``DEFAULT_SHORTHAND`` in the
        ``spinup/user_config.py`` file to ``False``.

        Args:
            key (string): Name of parameter.

            vals (value or list of values): Allowed values of parameter.

            shorthand (string): Optional, shortened name of parameter. For
                example, maybe the parameter ``steps_per_epoch`` is shortened
                to ``steps``.

            in_name (bool): When constructing variant names, force the
                inclusion of this parameter into the name.
        """
        assert isinstance(key, str), "Key must be a string."
        assert shorthand is None or isinstance(shorthand, str), \
            "Shorthand must be a string."
        # A single value is wrapped into a one-element list.
        if not isinstance(vals, list):
            vals = [vals]
        if DEFAULT_SHORTHAND and shorthand is None:
            shorthand = self._default_shorthand(key)
        self.keys.append(key)
        self.vals.append(vals)
        self.shs.append(shorthand)
        self.in_names.append(in_name)

    def variant_name(self, variant):
        """
        Given a variant (dict of valid param/value pairs), make an exp_name.

        A variant's name is constructed as the grid name (if you've given it
        one), plus param names (or shorthands if available) and values
        separated by underscores.

        Note: if ``seed`` is a parameter, it is not included in the name.
        """

        def get_val(v, k):
            # Utility method for getting the correct value out of a variant
            # given as a nested dict. Assumes that a parameter name, k,
            # describes a path into the nested dict, such that k='a:b:c'
            # corresponds to value=variant['a']['b']['c']. Uses recursion
            # to get this.
            if k in v:
                return v[k]
            else:
                splits = k.split(':')
                k0, k1 = splits[0], ':'.join(splits[1:])
                return get_val(v[k0], k1)

        # Start the name off with the name of the variant generator.
        var_name = self._name

        # Build the rest of the name by looping through all parameters,
        # and deciding which ones need to go in there.
        for k, v, sh, inn in zip(self.keys, self.vals, self.shs, self.in_names):

            # Include a parameter in a name if either 1) it can take multiple
            # values, or 2) the user specified that it must appear in the name.
            # Except, however, when the parameter is 'seed'. Seed is handled
            # differently so that runs of the same experiment, with different
            # seeds, will be grouped by experiment name.
            if (len(v) > 1 or inn) and not(k == 'seed'):

                # Use the shorthand if available, otherwise the full name.
                param_name = sh if sh is not None else k
                param_name = valid_str(param_name)

                # Get variant value for parameter k
                variant_val = get_val(variant, k)

                # Append to name
                if all_bools(v):
                    # If this is a param which only takes boolean values,
                    # only include in the name if it's True for this variant.
                    var_name += ('_' + param_name) if variant_val else ''
                else:
                    var_name += '_' + param_name + valid_str(variant_val)

        return var_name.lstrip('_')

    def variants(self):
        """
        Makes a list of dicts, where each dict is a valid config in the grid.

        There is special handling for variant parameters whose names take
        the form

            ``'full:param:name'``.

        The colons are taken to indicate that these parameters should
        have a nested dict structure. eg, if there are two params,

            ====================  ===
            Key                   Val
            ====================  ===
            ``'base:param:one'``  1
            ``'base:param:two'``  2
            ====================  ===

        the variant dict will have the structure

        .. parsed-literal::

            variant = {
                base: {
                    param : {
                        a : 1,
                        b : 2
                        }
                    }
                }
        """
        # _variants produces the flat (colon-keyed) cartesian product.
        flat_variants = self._variants(self.keys, self.vals)

        def unflatten_var(var):
            """
            Build the full nested dict version of var, based on key names.
            """
            new_var = dict()
            unflatten_set = set()

            for k, v in var.items():
                if ':' in k:
                    splits = k.split(':')
                    k0 = splits[0]
                    assert k0 not in new_var or isinstance(new_var[k0], dict), \
                        "You can't assign multiple values to the same key."

                    if not(k0 in new_var):
                        new_var[k0] = dict()

                    sub_k = ':'.join(splits[1:])
                    new_var[k0][sub_k] = v
                    unflatten_set.add(k0)
                else:
                    assert not(k in new_var), \
                        "You can't assign multiple values to the same key."
                    new_var[k] = v

            # Make sure to fill out the nested dicts.
            for k in unflatten_set:
                new_var[k] = unflatten_var(new_var[k])

            return new_var

        new_variants = [unflatten_var(var) for var in flat_variants]
        return new_variants

    def run(self, thunk, num_cpu=1, data_dir=None, datestamp=False):
        """
        Run each variant in the grid with function 'thunk'.

        Note: 'thunk' must be either a callable function, or a string. If it is
        a string, it must be the name of a parameter whose values are all
        callable functions.

        Uses ``call_experiment`` to actually launch each experiment, and gives
        each variant a name using ``self.variant_name()``.

        Maintenance note: the args for ExperimentGrid.run should track closely
        to the args for call_experiment. However, ``seed`` is omitted because
        we presume the user may add it as a parameter in the grid.
        """

        # Print info about self.
        self.print()

        # Make the list of all variants.
        variants = self.variants()

        # Print variant names for the user.
        var_names = set([self.variant_name(var) for var in variants])
        var_names = sorted(list(var_names))
        line = '='*DIV_LINE_WIDTH
        preparing = colorize('Preparing to run the following experiments...',
                             color='green', bold=True)
        joined_var_names = '\n'.join(var_names)
        announcement = f"\n{preparing}\n\n{joined_var_names}\n\n{line}"
        print(announcement)

        if WAIT_BEFORE_LAUNCH > 0:
            delay_msg = colorize(dedent("""
            Launch delayed to give you a few seconds to review your experiments.

            To customize or disable this behavior, change WAIT_BEFORE_LAUNCH in
            spinup/user_config.py.

            """), color='cyan', bold=True) + line
            print(delay_msg)
            wait, steps = WAIT_BEFORE_LAUNCH, 100
            prog_bar = trange(steps, desc='Launching in...',
                              leave=False, ncols=DIV_LINE_WIDTH,
                              mininterval=0.25,
                              bar_format='{desc}: {bar}| {remaining} {elapsed}')
            for _ in prog_bar:
                time.sleep(wait/steps)

        # Run the variants.
        for var in variants:
            exp_name = self.variant_name(var)

            # Figure out what the thunk is.
            if isinstance(thunk, str):
                # Assume one of the variant parameters has the same
                # name as the string you passed for thunk, and that
                # variant[thunk] is a valid callable function.
                thunk_ = var[thunk]
                del var[thunk]
            else:
                # Assume thunk is given as a function.
                thunk_ = thunk

            call_experiment(exp_name, thunk_, num_cpu=num_cpu,
                            data_dir=data_dir, datestamp=datestamp, **var)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/_build/html/_sources/algorithms/ddpg.rst.txt b/docs/_build/html/_sources/algorithms/ddpg.rst.txt
deleted file mode 100644
index 8c61759a0..000000000
--- a/docs/_build/html/_sources/algorithms/ddpg.rst.txt
+++ /dev/null
@@ -1,233 +0,0 @@
-==================================
-Deep Deterministic Policy Gradient
-==================================
-
-.. contents:: Table of Contents
-
-Background
-==========
-
-(Previously: `Introduction to RL Part 1: The Optimal Q-Function and the Optimal Action`_)
-
-.. _`Introduction to RL Part 1: The Optimal Q-Function and the Optimal Action`: ../spinningup/rl_intro.html#the-optimal-q-function-and-the-optimal-action
-
-Deep Deterministic Policy Gradient (DDPG) is an algorithm which concurrently learns a Q-function and a policy. It uses off-policy data and the Bellman equation to learn the Q-function, and uses the Q-function to learn the policy.
-
-This approach is closely connected to Q-learning, and is motivated the same way: if you know the optimal action-value function :math:`Q^*(s,a)`, then in any given state, the optimal action :math:`a^*(s)` can be found by solving
-
-.. math::
-
- a^*(s) = \arg \max_a Q^*(s,a).
-
-DDPG interleaves learning an approximator to :math:`Q^*(s,a)` with learning an approximator to :math:`a^*(s)`, and it does so in a way which is specifically adapted for environments with continuous action spaces. But what does it mean that DDPG is adapted *specifically* for environments with continuous action spaces? It relates to how we compute the max over actions in :math:`\max_a Q^*(s,a)`.
-
-When there are a finite number of discrete actions, the max poses no problem, because we can just compute the Q-values for each action separately and directly compare them. (This also immediately gives us the action which maximizes the Q-value.) But when the action space is continuous, we can't exhaustively evaluate the space, and solving the optimization problem is highly non-trivial. Using a normal optimization algorithm would make calculating :math:`\max_a Q^*(s,a)` a painfully expensive subroutine. And since it would need to be run every time the agent wants to take an action in the environment, this is unacceptable.
-
-Because the action space is continuous, the function :math:`Q^*(s,a)` is presumed to be differentiable with respect to the action argument. This allows us to set up an efficient, gradient-based learning rule for a policy :math:`\mu(s)` which exploits that fact. Then, instead of running an expensive optimization subroutine each time we wish to compute :math:`\max_a Q(s,a)`, we can approximate it with :math:`\max_a Q(s,a) \approx Q(s,\mu(s))`. See the Key Equations section for details.
-
-
-Quick Facts
------------
-
-* DDPG is an off-policy algorithm.
-* DDPG can only be used for environments with continuous action spaces.
-* DDPG can be thought of as being deep Q-learning for continuous action spaces.
-* The Spinning Up implementation of DDPG does not support parallelization.
-
-Key Equations
--------------
-
-Here, we'll explain the math behind the two parts of DDPG: learning a Q function, and learning a policy.
-
-The Q-Learning Side of DDPG
-^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-First, let's recap the Bellman equation describing the optimal action-value function, :math:`Q^*(s,a)`. It's given by
-
-.. math::
-
- Q^*(s,a) = \underset{s' \sim P}{{\mathrm E}}\left[r(s,a) + \gamma \max_{a'} Q^*(s', a')\right]
-
-where :math:`s' \sim P` is shorthand for saying that the next state, :math:`s'`, is sampled by the environment from a distribution :math:`P(\cdot| s,a)`.
-
-This Bellman equation is the starting point for learning an approximator to :math:`Q^*(s,a)`. Suppose the approximator is a neural network :math:`Q_{\phi}(s,a)`, with parameters :math:`\phi`, and that we have collected a set :math:`{\mathcal D}` of transitions :math:`(s,a,r,s',d)` (where :math:`d` indicates whether state :math:`s'` is terminal). We can set up a **mean-squared Bellman error (MSBE)** function, which tells us roughly how closely :math:`Q_{\phi}` comes to satisfying the Bellman equation:
-
-.. math::
-
- L(\phi, {\mathcal D}) = \underset{(s,a,r,s',d) \sim {\mathcal D}}{{\mathrm E}}\left[
- \Bigg( Q_{\phi}(s,a) - \left(r + \gamma (1 - d) \max_{a'} Q_{\phi}(s',a') \right) \Bigg)^2
- \right]
-
-Here, in evaluating :math:`(1-d)`, we've used a Python convention of evaluating ``True`` to 1 and ``False`` to zero. Thus, when ``d==True``---which is to say, when :math:`s'` is a terminal state---the Q-function should show that the agent gets no additional rewards after the current state. (This choice of notation corresponds to what we later implement in code.)
-
-Q-learning algorithms for function approximators, such as DQN (and all its variants) and DDPG, are largely based on minimizing this MSBE loss function. There are two main tricks employed by all of them which are worth describing, and then a specific detail for DDPG.
-
-**Trick One: Replay Buffers.** All standard algorithms for training a deep neural network to approximate :math:`Q^*(s,a)` make use of an experience replay buffer. This is the set :math:`{\mathcal D}` of previous experiences. In order for the algorithm to have stable behavior, the replay buffer should be large enough to contain a wide range of experiences, but it may not always be good to keep everything. If you only use the very-most recent data, you will overfit to that and things will break; if you use too much experience, you may slow down your learning. This may take some tuning to get right.
-
-.. admonition:: You Should Know
-
- We've mentioned that DDPG is an off-policy algorithm: this is as good a point as any to highlight why and how. Observe that the replay buffer *should* contain old experiences, even though they might have been obtained using an outdated policy. Why are we able to use these at all? The reason is that the Bellman equation *doesn't care* which transition tuples are used, or how the actions were selected, or what happens after a given transition, because the optimal Q-function should satisfy the Bellman equation for *all* possible transitions. So any transitions that we've ever experienced are fair game when trying to fit a Q-function approximator via MSBE minimization.
-
-**Trick Two: Target Networks.** Q-learning algorithms make use of **target networks**. The term
-
-.. math::
-
- r + \gamma (1 - d) \max_{a'} Q_{\phi}(s',a')
-
-is called the **target**, because when we minimize the MSBE loss, we are trying to make the Q-function be more like this target. Problematically, the target depends on the same parameters we are trying to train: :math:`\phi`. This makes MSBE minimization unstable. The solution is to use a set of parameters which comes close to :math:`\phi`, but with a time delay---that is to say, a second network, called the target network, which lags the first. The parameters of the target network are denoted :math:`\phi_{\text{targ}}`.
-
-In DQN-based algorithms, the target network is just copied over from the main network every some-fixed-number of steps. In DDPG-style algorithms, the target network is updated once per main network update by polyak averaging:
-
-.. math::
-
- \phi_{\text{targ}} \leftarrow \rho \phi_{\text{targ}} + (1 - \rho) \phi,
-
-where :math:`\rho` is a hyperparameter between 0 and 1 (usually close to 1). (This hyperparameter is called ``polyak`` in our code).
-
-
-**DDPG Detail: Calculating the Max Over Actions in the Target.** As mentioned earlier: computing the maximum over actions in the target is a challenge in continuous action spaces. DDPG deals with this by using a **target policy network** to compute an action which approximately maximizes :math:`Q_{\phi_{\text{targ}}}`. The target policy network is found the same way as the target Q-function: by polyak averaging the policy parameters over the course of training.
-
-Putting it all together, Q-learning in DDPG is performed by minimizing the following MSBE loss with stochastic gradient descent:
-
-.. math::
-
- L(\phi, {\mathcal D}) = \underset{(s,a,r,s',d) \sim {\mathcal D}}{{\mathrm E}}\left[
- \Bigg( Q_{\phi}(s,a) - \left(r + \gamma (1 - d) Q_{\phi_{\text{targ}}}(s', \mu_{\theta_{\text{targ}}}(s')) \right) \Bigg)^2
- \right],
-
-where :math:`\mu_{\theta_{\text{targ}}}` is the target policy.
-
-
-The Policy Learning Side of DDPG
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Policy learning in DDPG is fairly simple. We want to learn a deterministic policy :math:`\mu_{\theta}(s)` which gives the action that maximizes :math:`Q_{\phi}(s,a)`. Because the action space is continuous, and we assume the Q-function is differentiable with respect to action, we can just perform gradient ascent (with respect to policy parameters only) to solve
-
-.. math::
-
- \max_{\theta} \underset{s \sim {\mathcal D}}{{\mathrm E}}\left[ Q_{\phi}(s, \mu_{\theta}(s)) \right].
-
-Note that the Q-function parameters are treated as constants here.
-
-
-
-Exploration vs. Exploitation
-----------------------------
-
-DDPG trains a deterministic policy in an off-policy way. Because the policy is deterministic, if the agent were to explore on-policy, in the beginning it would probably not try a wide enough variety of actions to find useful learning signals. To make DDPG policies explore better, we add noise to their actions at training time. The authors of the original DDPG paper recommended time-correlated `OU noise`_, but more recent results suggest that uncorrelated, mean-zero Gaussian noise works perfectly well. Since the latter is simpler, it is preferred. To facilitate getting higher-quality training data, you may reduce the scale of the noise over the course of training. (We do not do this in our implementation, and keep noise scale fixed throughout.)
-
-At test time, to see how well the policy exploits what it has learned, we do not add noise to the actions.
-
-.. _`OU noise`: https://en.wikipedia.org/wiki/Ornstein%E2%80%93Uhlenbeck_process
-
-.. admonition:: You Should Know
-
- Our DDPG implementation uses a trick to improve exploration at the start of training. For a fixed number of steps at the beginning (set with the ``start_steps`` keyword argument), the agent takes actions which are sampled from a uniform random distribution over valid actions. After that, it returns to normal DDPG exploration.
-
-
-Pseudocode
-----------
-
-.. math::
- :nowrap:
-
- \begin{algorithm}[H]
- \caption{Deep Deterministic Policy Gradient}
- \label{alg1}
- \begin{algorithmic}[1]
- \STATE Input: initial policy parameters $\theta$, Q-function parameters $\phi$, empty replay buffer $\mathcal{D}$
- \STATE Set target parameters equal to main parameters $\theta_{\text{targ}} \leftarrow \theta$, $\phi_{\text{targ}} \leftarrow \phi$
- \REPEAT
- \STATE Observe state $s$ and select action $a = \text{clip}(\mu_{\theta}(s) + \epsilon, a_{Low}, a_{High})$, where $\epsilon \sim \mathcal{N}$
- \STATE Execute $a$ in the environment
- \STATE Observe next state $s'$, reward $r$, and done signal $d$ to indicate whether $s'$ is terminal
- \STATE Store $(s,a,r,s',d)$ in replay buffer $\mathcal{D}$
- \STATE If $s'$ is terminal, reset environment state.
- \IF{it's time to update}
- \FOR{however many updates}
- \STATE Randomly sample a batch of transitions, $B = \{ (s,a,r,s',d) \}$ from $\mathcal{D}$
- \STATE Compute targets
- \begin{equation*}
- y(r,s',d) = r + \gamma (1-d) Q_{\phi_{\text{targ}}}(s', \mu_{\theta_{\text{targ}}}(s'))
- \end{equation*}
- \STATE Update Q-function by one step of gradient descent using
- \begin{equation*}
- \nabla_{\phi} \frac{1}{|B|}\sum_{(s,a,r,s',d) \in B} \left( Q_{\phi}(s,a) - y(r,s',d) \right)^2
- \end{equation*}
- \STATE Update policy by one step of gradient ascent using
- \begin{equation*}
- \nabla_{\theta} \frac{1}{|B|}\sum_{s \in B}Q_{\phi}(s, \mu_{\theta}(s))
- \end{equation*}
- \STATE Update target networks with
- \begin{align*}
- \phi_{\text{targ}} &\leftarrow \rho \phi_{\text{targ}} + (1-\rho) \phi \\
- \theta_{\text{targ}} &\leftarrow \rho \theta_{\text{targ}} + (1-\rho) \theta
- \end{align*}
- \ENDFOR
- \ENDIF
- \UNTIL{convergence}
- \end{algorithmic}
- \end{algorithm}
-
-
-Documentation
-=============
-
-.. autofunction:: spinup.ddpg
-
-Saved Model Contents
---------------------
-
-The computation graph saved by the logger includes:
-
-======== ====================================================================
-Key Value
-======== ====================================================================
-``x`` Tensorflow placeholder for state input.
-``a`` Tensorflow placeholder for action input.
-``pi`` | Deterministically computes an action from the agent, conditioned
- | on states in ``x``.
-``q`` Gives action-value estimate for states in ``x`` and actions in ``a``.
-======== ====================================================================
-
-This saved model can be accessed either by
-
-* running the trained policy with the `test_policy.py`_ tool,
-* or loading the whole saved graph into a program with `restore_tf_graph`_.
-
-.. _`test_policy.py`: ../user/saving_and_loading.html#loading-and-running-trained-policies
-.. _`restore_tf_graph`: ../utils/logger.html#spinup.utils.logx.restore_tf_graph
-
-
-References
-==========
-
-Relevant Papers
----------------
-
-- `Deterministic Policy Gradient Algorithms`_, Silver et al. 2014
-- `Continuous Control With Deep Reinforcement Learning`_, Lillicrap et al. 2016
-
-.. _`Deterministic Policy Gradient Algorithms`: http://proceedings.mlr.press/v32/silver14.pdf
-.. _`Continuous Control With Deep Reinforcement Learning`: https://arxiv.org/abs/1509.02971
-
-
-Why These Papers?
------------------
-
-Silver 2014 is included because it establishes the theory underlying deterministic policy gradients (DPG). Lillicrap 2016 is included because it adapts the theoretically-grounded DPG algorithm to the deep RL setting, giving DDPG.
-
-
-
-Other Public Implementations
-----------------------------
-
-- Baselines_
-- rllab_
-- `rllib (Ray)`_
-- `TD3 release repo`_
-
-.. _Baselines: https://github.com/openai/baselines/tree/master/baselines/ddpg
-.. _rllab: https://github.com/rll/rllab/blob/master/rllab/algos/ddpg.py
-.. _`rllib (Ray)`: https://github.com/ray-project/ray/tree/master/python/ray/rllib/agents/ddpg
-.. _`TD3 release repo`: https://github.com/sfujim/TD3
diff --git a/docs/_build/html/_sources/algorithms/ppo.rst.txt b/docs/_build/html/_sources/algorithms/ppo.rst.txt
deleted file mode 100644
index 04fb3469b..000000000
--- a/docs/_build/html/_sources/algorithms/ppo.rst.txt
+++ /dev/null
@@ -1,208 +0,0 @@
-============================
-Proximal Policy Optimization
-============================
-
-.. contents:: Table of Contents
-
-
-Background
-==========
-
-
-(Previously: `Background for TRPO`_)
-
-.. _`Background for TRPO`: ../algorithms/trpo.html#background
-
-PPO is motivated by the same question as TRPO: how can we take the biggest possible improvement step on a policy using the data we currently have, without stepping so far that we accidentally cause performance collapse? Where TRPO tries to solve this problem with a complex second-order method, PPO is a family of first-order methods that use a few other tricks to keep new policies close to old. PPO methods are significantly simpler to implement, and empirically seem to perform at least as well as TRPO.
-
-There are two primary variants of PPO: PPO-Penalty and PPO-Clip.
-
-**PPO-Penalty** approximately solves a KL-constrained update like TRPO, but penalizes the KL-divergence in the objective function instead of making it a hard constraint, and automatically adjusts the penalty coefficient over the course of training so that it's scaled appropriately.
-
-**PPO-Clip** doesn't have a KL-divergence term in the objective and doesn't have a constraint at all. Instead, it relies on specialized clipping in the objective function to remove incentives for the new policy to get far from the old policy.
-
-Here, we'll focus only on PPO-Clip (the primary variant used at OpenAI).
-
-Quick Facts
------------
-
-* PPO is an on-policy algorithm.
-* PPO can be used for environments with either discrete or continuous action spaces.
-* The Spinning Up implementation of PPO supports parallelization with MPI.
-
-Key Equations
--------------
-
-PPO-clip updates policies via
-
-.. math::
-
- \theta_{k+1} = \arg \max_{\theta} \underset{s,a \sim \pi_{\theta_k}}{{\mathrm E}}\left[
- L(s,a,\theta_k, \theta)\right],
-
-typically taking multiple steps of (usually minibatch) SGD to maximize the objective. Here :math:`L` is given by
-
-.. math::
-
- L(s,a,\theta_k,\theta) = \min\left(
- \frac{\pi_{\theta}(a|s)}{\pi_{\theta_k}(a|s)} A^{\pi_{\theta_k}}(s,a), \;\;
- \text{clip}\left(\frac{\pi_{\theta}(a|s)}{\pi_{\theta_k}(a|s)}, 1 - \epsilon, 1+\epsilon \right) A^{\pi_{\theta_k}}(s,a)
- \right),
-
-in which :math:`\epsilon` is a (small) hyperparameter which roughly says how far away the new policy is allowed to go from the old.
-
-This is a pretty complex expression, and it's hard to tell at first glance what it's doing, or how it helps keep the new policy close to the old policy. As it turns out, there's a considerably simplified version [1]_ of this objective which is a bit easier to grapple with (and is also the version we implement in our code):
-
-.. math::
-
- L(s,a,\theta_k,\theta) = \min\left(
- \frac{\pi_{\theta}(a|s)}{\pi_{\theta_k}(a|s)} A^{\pi_{\theta_k}}(s,a), \;\;
- g(\epsilon, A^{\pi_{\theta_k}}(s,a))
- \right),
-
-where
-
-.. math::
-
- g(\epsilon, A) = \left\{
- \begin{array}{ll}
- (1 + \epsilon) A & A \geq 0 \\
- (1 - \epsilon) A & A < 0.
- \end{array}
- \right.
-
-To figure out what intuition to take away from this, let's look at a single state-action pair :math:`(s,a)`, and think of cases.
-
-**Advantage is positive**: Suppose the advantage for that state-action pair is positive, in which case its contribution to the objective reduces to
-
-.. math::
-
- L(s,a,\theta_k,\theta) = \min\left(
- \frac{\pi_{\theta}(a|s)}{\pi_{\theta_k}(a|s)}, (1 + \epsilon)
- \right) A^{\pi_{\theta_k}}(s,a).
-
-Because the advantage is positive, the objective will increase if the action becomes more likely---that is, if :math:`\pi_{\theta}(a|s)` increases. But the min in this term puts a limit to how *much* the objective can increase. Once :math:`\pi_{\theta}(a|s) > (1+\epsilon) \pi_{\theta_k}(a|s)`, the min kicks in and this term hits a ceiling of :math:`(1+\epsilon) A^{\pi_{\theta_k}}(s,a)`. Thus: *the new policy does not benefit by going far away from the old policy*.
-
-**Advantage is negative**: Suppose the advantage for that state-action pair is negative, in which case its contribution to the objective reduces to
-
-.. math::
-
- L(s,a,\theta_k,\theta) = \max\left(
- \frac{\pi_{\theta}(a|s)}{\pi_{\theta_k}(a|s)}, (1 - \epsilon)
- \right) A^{\pi_{\theta_k}}(s,a).
-
-Because the advantage is negative, the objective will increase if the action becomes less likely---that is, if :math:`\pi_{\theta}(a|s)` decreases. But the max in this term puts a limit to how *much* the objective can increase. Once :math:`\pi_{\theta}(a|s) < (1-\epsilon) \pi_{\theta_k}(a|s)`, the max kicks in and this term hits a ceiling of :math:`(1-\epsilon) A^{\pi_{\theta_k}}(s,a)`. Thus, again: *the new policy does not benefit by going far away from the old policy*.
-
-What we have seen so far is that clipping serves as a regularizer by removing incentives for the policy to change dramatically, and the hyperparameter :math:`\epsilon` corresponds to how far away the new policy can go from the old while still profiting the objective.
-
-.. admonition:: You Should Know
-
- While this kind of clipping goes a long way towards ensuring reasonable policy updates, it is still possible to end up with a new policy which is too far from the old policy, and there are a bunch of tricks used by different PPO implementations to stave this off. In our implementation here, we use a particularly simple method: early stopping. If the mean KL-divergence of the new policy from the old grows beyond a threshold, we stop taking gradient steps.
-
- When you feel comfortable with the basic math and implementation details, it's worth checking out other implementations to see how they handle this issue!
-
-
-.. [1] See `this note`_ for a derivation of the simplified form of the PPO-Clip objective.
-
-
-.. _`this note`: https://drive.google.com/file/d/1PDzn9RPvaXjJFZkGeapMHbHGiWWW20Ey/view?usp=sharing
-
-
-Exploration vs. Exploitation
-----------------------------
-
-PPO trains a stochastic policy in an on-policy way. This means that it explores by sampling actions according to the latest version of its stochastic policy. The amount of randomness in action selection depends on both initial conditions and the training procedure. Over the course of training, the policy typically becomes progressively less random, as the update rule encourages it to exploit rewards that it has already found. This may cause the policy to get trapped in local optima.
-
-
-Pseudocode
-----------
-
-.. math::
- :nowrap:
-
- \begin{algorithm}[H]
- \caption{PPO-Clip}
- \label{alg1}
- \begin{algorithmic}[1]
- \STATE Input: initial policy parameters $\theta_0$, initial value function parameters $\phi_0$
- \FOR{$k = 0,1,2,...$}
- \STATE Collect set of trajectories ${\mathcal D}_k = \{\tau_i\}$ by running policy $\pi_k = \pi(\theta_k)$ in the environment.
- \STATE Compute rewards-to-go $\hat{R}_t$.
- \STATE Compute advantage estimates, $\hat{A}_t$ (using any method of advantage estimation) based on the current value function $V_{\phi_k}$.
- \STATE Update the policy by maximizing the PPO-Clip objective:
- \begin{equation*}
- \theta_{k+1} = \arg \max_{\theta} \frac{1}{|{\mathcal D}_k| T} \sum_{\tau \in {\mathcal D}_k} \sum_{t=0}^T \min\left(
- \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\theta_k}(a_t|s_t)} A^{\pi_{\theta_k}}(s_t,a_t), \;\;
- g(\epsilon, A^{\pi_{\theta_k}}(s_t,a_t))
- \right),
- \end{equation*}
- typically via stochastic gradient ascent with Adam.
- \STATE Fit value function by regression on mean-squared error:
- \begin{equation*}
- \phi_{k+1} = \arg \min_{\phi} \frac{1}{|{\mathcal D}_k| T} \sum_{\tau \in {\mathcal D}_k} \sum_{t=0}^T\left( V_{\phi} (s_t) - \hat{R}_t \right)^2,
- \end{equation*}
- typically via some gradient descent algorithm.
- \ENDFOR
- \end{algorithmic}
- \end{algorithm}
-
-
-Documentation
-=============
-
-.. autofunction:: spinup.ppo
-
-Saved Model Contents
---------------------
-
-The computation graph saved by the logger includes:
-
-======== ====================================================================
-Key Value
-======== ====================================================================
-``x`` Tensorflow placeholder for state input.
-``pi`` Samples an action from the agent, conditioned on states in ``x``.
-``v`` Gives value estimate for states in ``x``.
-======== ====================================================================
-
-This saved model can be accessed either by
-
-* running the trained policy with the `test_policy.py`_ tool,
-* or loading the whole saved graph into a program with `restore_tf_graph`_.
-
-.. _`test_policy.py`: ../user/saving_and_loading.html#loading-and-running-trained-policies
-.. _`restore_tf_graph`: ../utils/logger.html#spinup.utils.logx.restore_tf_graph
-
-References
-==========
-
-Relevant Papers
----------------
-
-- `Proximal Policy Optimization Algorithms`_, Schulman et al. 2017
-- `High Dimensional Continuous Control Using Generalized Advantage Estimation`_, Schulman et al. 2016
-- `Emergence of Locomotion Behaviours in Rich Environments`_, Heess et al. 2017
-
-.. _`Proximal Policy Optimization Algorithms`: https://arxiv.org/abs/1707.06347
-.. _`High Dimensional Continuous Control Using Generalized Advantage Estimation`: https://arxiv.org/abs/1506.02438
-.. _`Emergence of Locomotion Behaviours in Rich Environments`: https://arxiv.org/abs/1707.02286
-
-Why These Papers?
------------------
-
-Schulman 2017 is included because it is the original paper describing PPO. Schulman 2016 is included because our implementation of PPO makes use of Generalized Advantage Estimation for computing the policy gradient. Heess 2017 is included because it presents a large-scale empirical analysis of behaviors learned by PPO agents in complex environments (although it uses PPO-penalty instead of PPO-clip).
-
-
-
-Other Public Implementations
-----------------------------
-
-- Baselines_
-- ModularRL_ (Caution: this implements PPO-penalty instead of PPO-clip.)
-- rllab_ (Caution: this implements PPO-penalty instead of PPO-clip.)
-- `rllib (Ray)`_
-
-.. _Baselines: https://github.com/openai/baselines/tree/master/baselines/ppo2
-.. _ModularRL: https://github.com/joschu/modular_rl/blob/master/modular_rl/ppo.py
-.. _rllab: https://github.com/rll/rllab/blob/master/rllab/algos/ppo.py
-.. _`rllib (Ray)`: https://github.com/ray-project/ray/tree/master/python/ray/rllib/agents/ppo
\ No newline at end of file
diff --git a/docs/_build/html/_sources/algorithms/sac.rst.txt b/docs/_build/html/_sources/algorithms/sac.rst.txt
deleted file mode 100644
index 84010d1be..000000000
--- a/docs/_build/html/_sources/algorithms/sac.rst.txt
+++ /dev/null
@@ -1,264 +0,0 @@
-=================
-Soft Actor-Critic
-=================
-
-.. contents:: Table of Contents
-
-Background
-==========
-
-(Previously: `Background for TD3`_)
-
-.. _`Background for TD3`: ../algorithms/td3.html#background
-
-Soft Actor Critic (SAC) is an algorithm which optimizes a stochastic policy in an off-policy way, forming a bridge between stochastic policy optimization and DDPG-style approaches. It isn't a direct successor to TD3 (having been published roughly concurrently), but it incorporates the clipped double-Q trick, and due to the inherent stochasticity of the policy in SAC, it also winds up benefiting from something like target policy smoothing.
-
-A central feature of SAC is **entropy regularization.** The policy is trained to maximize a trade-off between expected return and `entropy`_, a measure of randomness in the policy. This has a close connection to the exploration-exploitation trade-off: increasing entropy results in more exploration, which can accelerate learning later on. It can also prevent the policy from prematurely converging to a bad local optimum.
-
-.. _`entropy`: https://en.wikipedia.org/wiki/Entropy_(information_theory)
-
-Quick Facts
------------
-
-* SAC is an off-policy algorithm.
-* The version of SAC implemented here can only be used for environments with continuous action spaces.
-* An alternate version of SAC, which slightly changes the policy update rule, can be implemented to handle discrete action spaces.
-* The Spinning Up implementation of SAC does not support parallelization.
-
-Key Equations
--------------
-
-To explain Soft Actor Critic, we first have to introduce the entropy-regularized reinforcement learning setting. In entropy-regularized RL, there are slightly-different equations for value functions.
-
-Entropy-Regularized Reinforcement Learning
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Entropy is a quantity which, roughly speaking, says how random a random variable is. If a coin is weighted so that it almost always comes up heads, it has low entropy; if it's evenly weighted and has a half chance of either outcome, it has high entropy.
-
-Let :math:`x` be a random variable with probability mass or density function :math:`P`. The entropy :math:`H` of :math:`x` is computed from its distribution :math:`P` according to
-
-.. math::
-
- H(P) = \underE{x \sim P}{-\log P(x)}.
-
-In entropy-regularized reinforcement learning, the agent gets a bonus reward at each time step proportional to the entropy of the policy at that timestep. This changes `the RL problem`_ to:
-
-.. math::
-
- \pi^* = \arg \max_{\pi} \underE{\tau \sim \pi}{ \sum_{t=0}^{\infty} \gamma^t \bigg( R(s_t, a_t, s_{t+1}) + \alpha H\left(\pi(\cdot|s_t)\right) \bigg)},
-
-where :math:`\alpha > 0` is the trade-off coefficient. (Note: we're assuming an infinite-horizon discounted setting here, and we'll do the same for the rest of this page.) We can now define the slightly-different value functions in this setting. :math:`V^{\pi}` is changed to include the entropy bonuses from every timestep:
-
-.. math::
-
- V^{\pi}(s) = \underE{\tau \sim \pi}{ \left. \sum_{t=0}^{\infty} \gamma^t \bigg( R(s_t, a_t, s_{t+1}) + \alpha H\left(\pi(\cdot|s_t)\right) \bigg) \right| s_0 = s}
-
-:math:`Q^{\pi}` is changed to include the entropy bonuses from every timestep *except the first*:
-
-.. math::
-
- Q^{\pi}(s,a) = \underE{\tau \sim \pi}{ \left. \sum_{t=0}^{\infty} \gamma^t R(s_t, a_t, s_{t+1}) + \alpha \sum_{t=1}^{\infty} \gamma^t H\left(\pi(\cdot|s_t)\right)\right| s_0 = s, a_0 = a}
-
-With these definitions, :math:`V^{\pi}` and :math:`Q^{\pi}` are connected by:
-
-.. math::
-
- V^{\pi}(s) = \underE{a \sim \pi}{Q^{\pi}(s,a)} + \alpha H\left(\pi(\cdot|s)\right)
-
-and the Bellman equation for :math:`Q^{\pi}` is
-
-.. math::
-
- Q^{\pi}(s,a) &= \underE{s' \sim P \\ a' \sim \pi}{R(s,a,s') + \gamma\left(Q^{\pi}(s',a') + \alpha H\left(\pi(\cdot|s')\right) \right)} \\
- &= \underE{s' \sim P}{R(s,a,s') + \gamma V^{\pi}(s')}.
-
-.. _`the RL problem`: ../spinningup/rl_intro.html#the-rl-problem
-
-.. admonition:: You Should Know
-
- The way we've set up the value functions in the entropy-regularized setting is a little bit arbitrary, and actually we could have done it differently (eg make :math:`Q^{\pi}` include the entropy bonus at the first timestep). The choice of definition may vary slightly across papers on the subject.
-
-
-Soft Actor-Critic
-^^^^^^^^^^^^^^^^^
-
-SAC concurrently learns a policy :math:`\pi_{\theta}`, two Q-functions :math:`Q_{\phi_1}, Q_{\phi_2}`, and a value function :math:`V_{\psi}`.
-
-**Learning Q.** The Q-functions are learned by MSBE minimization, using a **target value network** to form the Bellman backups. They both use the same target, like in TD3, and have loss functions:
-
-.. math::
-
- L(\phi_i, {\mathcal D}) = \underset{(s,a,r,s',d) \sim {\mathcal D}}{{\mathrm E}}\left[
- \Bigg( Q_{\phi_i}(s,a) - \left(r + \gamma (1 - d) V_{\psi_{\text{targ}}}(s') \right) \Bigg)^2
- \right].
-
-The target value network, like the target networks in DDPG and TD3, is obtained by polyak averaging the value network parameters over the course of training.
-
-**Learning V.** The value function is learned by exploiting (a sample-based approximation of) the connection between :math:`Q^{\pi}` and :math:`V^{\pi}`. Before we go into the learning rule, let's first rewrite the connection equation by using the definition of entropy to obtain:
-
-.. math::
-
- V^{\pi}(s) &= \underE{a \sim \pi}{Q^{\pi}(s,a)} + \alpha H\left(\pi(\cdot|s)\right) \\
- &= \underE{a \sim \pi}{Q^{\pi}(s,a) - \alpha \log \pi(a|s)}.
-
-The RHS is an expectation over actions, so we can approximate it by sampling from the policy:
-
-.. math::
-
- V^{\pi}(s) \approx Q^{\pi}(s,\tilde{a}) - \alpha \log \pi(\tilde{a}|s), \;\;\;\;\; \tilde{a} \sim \pi(\cdot|s).
-
-SAC sets up a mean-squared-error loss for :math:`V_{\psi}` based on this approximation. But what Q-value do we use? SAC uses **clipped double-Q** like TD3 for learning the value function, and takes the minimum Q-value between the two approximators. So the SAC loss for value function parameters is:
-
-.. math::
-
- L(\psi, {\mathcal D}) = \underE{s \sim \mathcal{D} \\ \tilde{a} \sim \pi_{\theta}}{\Bigg(V_{\psi}(s) - \left(\min_{i=1,2} Q_{\phi_i}(s,\tilde{a}) - \alpha \log \pi_{\theta}(\tilde{a}|s) \right)\Bigg)^2}.
-
-Importantly, we do **not** use actions from the replay buffer here: these actions are sampled fresh from the current version of the policy.
-
-**Learning the Policy.** The policy should, in each state, act to maximize the expected future return plus expected future entropy. That is, it should maximize :math:`V^{\pi}(s)`, which we expand out (as before) into
-
-.. math::
-
- \underE{a \sim \pi}{Q^{\pi}(s,a) - \alpha \log \pi(a|s)}.
-
-The way we optimize the policy makes use of the **reparameterization trick**, in which a sample from :math:`\pi_{\theta}(\cdot|s)` is drawn by computing a deterministic function of state, policy parameters, and independent noise. To illustrate: following the authors of the SAC paper, we use a squashed Gaussian policy, which means that samples are obtained according to
-
-.. math::
-
- \tilde{a}_{\theta}(s, \xi) = \tanh\left( \mu_{\theta}(s) + \sigma_{\theta}(s) \odot \xi \right), \;\;\;\;\; \xi \sim \mathcal{N}(0, I).
-
-.. admonition:: You Should Know
-
- This policy has two key differences from the policies we use in the other policy optimization algorithms:
-
- **1. The squashing function.** The :math:`\tanh` in the SAC policy ensures that actions are bounded to a finite range. This is absent in the VPG, TRPO, and PPO policies. It also changes the distribution: before the :math:`\tanh` the SAC policy is a factored Gaussian like the other algorithms' policies, but after the :math:`\tanh` it is not. (You can still compute the log-probabilities of actions in closed form, though: see the paper appendix for details.)
-
- **2. The way standard deviations are parameterized.** In VPG, TRPO, and PPO, we represent the log std devs with state-independent parameter vectors. In SAC, we represent the log std devs as outputs from the neural network, meaning that they depend on state in a complex way. SAC with state-independent log std devs, in our experience, did not work. (Can you think of why? Or better yet: run an experiment to verify?)
-
-The reparameterization trick allows us to rewrite the expectation over actions (which contains a pain point: the distribution depends on the policy parameters) into an expectation over noise (which removes the pain point: the distribution now has no dependence on parameters):
-
-.. math::
-
- \underE{a \sim \pi_{\theta}}{Q^{\pi_{\theta}}(s,a) - \alpha \log \pi_{\theta}(a|s)} = \underE{\xi \sim \mathcal{N}}{Q^{\pi_{\theta}}(s,\tilde{a}_{\theta}(s,\xi)) - \alpha \log \pi_{\theta}(\tilde{a}_{\theta}(s,\xi)|s)}
-
-To get the policy loss, the final step is that we need to substitute :math:`Q^{\pi_{\theta}}` with one of our function approximators. The same as in TD3, we use :math:`Q_{\phi_1}`. The policy is thus optimized according to
-
-.. math::
-
- \max_{\theta} \underE{s \sim \mathcal{D} \\ \xi \sim \mathcal{N}}{Q_{\phi_1}(s,\tilde{a}_{\theta}(s,\xi)) - \alpha \log \pi_{\theta}(\tilde{a}_{\theta}(s,\xi)|s)},
-
-which is almost the same as the DDPG and TD3 policy optimization, except for the stochasticity and entropy term.
-
-
-Exploration vs. Exploitation
-----------------------------
-
-SAC trains a stochastic policy with entropy regularization, and explores in an on-policy way. The entropy regularization coefficient :math:`\alpha` explicitly controls the explore-exploit tradeoff, with higher :math:`\alpha` corresponding to more exploration, and lower :math:`\alpha` corresponding to more exploitation. The right coefficient (the one which leads to the stablest / highest-reward learning) may vary from environment to environment, and could require careful tuning.
-
-At test time, to see how well the policy exploits what it has learned, we remove stochasticity and use the mean action instead of a sample from the distribution. This tends to improve performance over the original stochastic policy.
-
-.. admonition:: You Should Know
-
- Our SAC implementation uses a trick to improve exploration at the start of training. For a fixed number of steps at the beginning (set with the ``start_steps`` keyword argument), the agent takes actions which are sampled from a uniform random distribution over valid actions. After that, it returns to normal SAC exploration.
-
-
-Pseudocode
-----------
-
-
-.. math::
- :nowrap:
-
- \begin{algorithm}[H]
- \caption{Soft Actor-Critic}
- \label{alg1}
- \begin{algorithmic}[1]
- \STATE Input: initial policy parameters $\theta$, Q-function parameters $\phi_1$, $\phi_2$, V-function parameters $\psi$, empty replay buffer $\mathcal{D}$
- \STATE Set target parameters equal to main parameters $\psi_{\text{targ}} \leftarrow \psi$
- \REPEAT
- \STATE Observe state $s$ and select action $a \sim \pi_{\theta}(\cdot|s)$
- \STATE Execute $a$ in the environment
- \STATE Observe next state $s'$, reward $r$, and done signal $d$ to indicate whether $s'$ is terminal
- \STATE Store $(s,a,r,s',d)$ in replay buffer $\mathcal{D}$
- \STATE If $s'$ is terminal, reset environment state.
- \IF{it's time to update}
- \FOR{$j$ in range(however many updates)}
- \STATE Randomly sample a batch of transitions, $B = \{ (s,a,r,s',d) \}$ from $\mathcal{D}$
- \STATE Compute targets for Q and V functions:
- \begin{align*}
- y_q (r,s',d) &= r + \gamma (1-d) V_{\psi_{\text{targ}}}(s') &&\\
- y_v (s) &= \min_{i=1,2} Q_{\phi_i} (s, \tilde{a}) - \alpha \log \pi_{\theta}(\tilde{a}|s), && \tilde{a} \sim \pi_{\theta}(\cdot|s)
- \end{align*}
- \STATE Update Q-functions by one step of gradient descent using
- \begin{align*}
- & \nabla_{\phi_i} \frac{1}{|B|}\sum_{(s,a,r,s',d) \in B} \left( Q_{\phi_i}(s,a) - y_q(r,s',d) \right)^2 && \text{for } i=1,2
- \end{align*}
- \STATE Update V-function by one step of gradient descent using
- \begin{equation*}
- \nabla_{\psi} \frac{1}{|B|}\sum_{s \in B} \left( V_{\psi}(s) - y_v(s) \right)^2
- \end{equation*}
- \STATE Update policy by one step of gradient ascent using
- \begin{equation*}
- \nabla_{\theta} \frac{1}{|B|}\sum_{s \in B} \Big( Q_{\phi_1}(s, \tilde{a}_{\theta}(s)) - \alpha \log \pi_{\theta} \left(\left. \tilde{a}_{\theta}(s) \right| s\right) \Big),
- \end{equation*}
- where $\tilde{a}_{\theta}(s)$ is a sample from $\pi_{\theta}(\cdot|s)$ which is differentiable wrt $\theta$ via the reparameterization trick.
- \STATE Update target value network with
- \begin{align*}
- \psi_{\text{targ}} &\leftarrow \rho \psi_{\text{targ}} + (1-\rho) \psi
- \end{align*}
- \ENDFOR
- \ENDIF
- \UNTIL{convergence}
- \end{algorithmic}
- \end{algorithm}
-
-Documentation
-=============
-
-.. autofunction:: spinup.sac
-
-Saved Model Contents
---------------------
-
-The computation graph saved by the logger includes:
-
-======== ====================================================================
-Key Value
-======== ====================================================================
-``x`` Tensorflow placeholder for state input.
-``a`` Tensorflow placeholder for action input.
-``mu`` Deterministically computes mean action from the agent, given states in ``x``.
-``pi`` Samples an action from the agent, conditioned on states in ``x``.
-``q1`` Gives one action-value estimate for states in ``x`` and actions in ``a``.
-``q2`` Gives the other action-value estimate for states in ``x`` and actions in ``a``.
-``v`` Gives the value estimate for states in ``x``.
-======== ====================================================================
-
-This saved model can be accessed either by
-
-* running the trained policy with the `test_policy.py`_ tool,
-* or loading the whole saved graph into a program with `restore_tf_graph`_.
-
-Note: for SAC, the correct evaluation policy is given by ``mu`` and not by ``pi``. The policy ``pi`` may be thought of as the exploration policy, while ``mu`` is the exploitation policy.
-
-.. _`test_policy.py`: ../user/saving_and_loading.html#loading-and-running-trained-policies
-.. _`restore_tf_graph`: ../utils/logger.html#spinup.utils.logx.restore_tf_graph
-
-
-References
-==========
-
-Relevant Papers
----------------
-
-- `Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor`_, Haarnoja et al, 2018
-
-.. _`Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor`: https://arxiv.org/abs/1801.01290
-
-
-Other Public Implementations
-----------------------------
-
-- `SAC release repo`_
-
-.. _`SAC release repo`: https://github.com/haarnoja/sac
\ No newline at end of file
diff --git a/docs/_build/html/_sources/algorithms/td3.rst.txt b/docs/_build/html/_sources/algorithms/td3.rst.txt
deleted file mode 100644
index 677a4aada..000000000
--- a/docs/_build/html/_sources/algorithms/td3.rst.txt
+++ /dev/null
@@ -1,186 +0,0 @@
-=================
-Twin Delayed DDPG
-=================
-
-.. contents:: Table of Contents
-
-Background
-==========
-
-(Previously: `Background for DDPG`_)
-
-.. _`Background for DDPG`: ../algorithms/ddpg.html#background
-
-While DDPG can achieve great performance sometimes, it is frequently brittle with respect to hyperparameters and other kinds of tuning. A common failure mode for DDPG is that the learned Q-function begins to dramatically overestimate Q-values, which then leads to the policy breaking, because it exploits the errors in the Q-function. Twin Delayed DDPG (TD3) is an algorithm which addresses this issue by introducing three critical tricks:
-
-**Trick One: Clipped Double-Q Learning.** TD3 learns *two* Q-functions instead of one (hence "twin"), and uses the smaller of the two Q-values to form the targets in the Bellman error loss functions.
-
-**Trick Two: "Delayed" Policy Updates.** TD3 updates the policy (and target networks) less frequently than the Q-function. The paper recommends one policy update for every two Q-function updates.
-
-**Trick Three: Target Policy Smoothing.** TD3 adds noise to the target action, to make it harder for the policy to exploit Q-function errors by smoothing out Q along changes in action.
-
-Together, these three tricks result in substantially improved performance over baseline DDPG.
-
-Quick Facts
------------
-
-* TD3 is an off-policy algorithm.
-* TD3 can only be used for environments with continuous action spaces.
-* The Spinning Up implementation of TD3 does not support parallelization.
-
-Key Equations
--------------
-
-TD3 concurrently learns two Q-functions, :math:`Q_{\phi_1}` and :math:`Q_{\phi_2}`, by mean square Bellman error minimization, in almost the same way that DDPG learns its single Q-function. To show exactly how TD3 does this and how it differs from normal DDPG, we'll work from the innermost part of the loss function outwards.
-
-First: **target policy smoothing**. Actions used to form the Q-learning target are based on the target policy, :math:`\mu_{\theta_{\text{targ}}}`, but with clipped noise added on each dimension of the action. After adding the clipped noise, the target action is then clipped to lie in the valid action range (all valid actions, :math:`a`, satisfy :math:`a_{Low} \leq a \leq a_{High}`). The target actions are thus:
-
-.. math::
-
- a'(s') = \text{clip}\left(\mu_{\theta_{\text{targ}}}(s') + \text{clip}(\epsilon,-c,c), a_{Low}, a_{High}\right), \;\;\;\;\; \epsilon \sim \mathcal{N}(0, \sigma)
-
-Target policy smoothing essentially serves as a regularizer for the algorithm. It addresses a particular failure mode that can happen in DDPG: if the Q-function approximator develops an incorrect sharp peak for some actions, the policy will quickly exploit that peak and then have brittle or incorrect behavior. This can be averted by smoothing out the Q-function over similar actions, which target policy smoothing is designed to do.
-
-Next: **clipped double-Q learning**. Both Q-functions use a single target, calculated using whichever of the two Q-functions gives a smaller target value:
-
-.. math::
-
- y(r,s',d) = r + \gamma (1 - d) \min_{i=1,2} Q_{\phi_{i, \text{targ}}}(s', a'(s')),
-
-and then both are learned by regressing to this target:
-
-.. math::
-
- L(\phi_1, {\mathcal D}) = \underE{(s,a,r,s',d) \sim {\mathcal D}}{
- \Bigg( Q_{\phi_1}(s,a) - y(r,s',d) \Bigg)^2
- },
-
-.. math::
-
- L(\phi_2, {\mathcal D}) = \underE{(s,a,r,s',d) \sim {\mathcal D}}{
- \Bigg( Q_{\phi_2}(s,a) - y(r,s',d) \Bigg)^2
- }.
-
-Using the smaller Q-value for the target, and regressing towards that, helps fend off overestimation in the Q-function.
-
-Lastly: the policy is learned just by maximizing :math:`Q_{\phi_1}`:
-
-.. math::
-
- \max_{\theta} \underset{s \sim {\mathcal D}}{{\mathrm E}}\left[ Q_{\phi_1}(s, \mu_{\theta}(s)) \right],
-
-which is pretty much unchanged from DDPG. However, in TD3, the policy is updated less frequently than the Q-functions are. This helps damp the volatility that normally arises in DDPG because of how a policy update changes the target.
-
-
-Exploration vs. Exploitation
-----------------------------
-
-TD3 trains a deterministic policy in an off-policy way. Because the policy is deterministic, if the agent were to explore on-policy, in the beginning it would probably not try a wide enough variety of actions to find useful learning signals. To make TD3 policies explore better, we add noise to their actions at training time, typically uncorrelated mean-zero Gaussian noise. To facilitate getting higher-quality training data, you may reduce the scale of the noise over the course of training. (We do not do this in our implementation, and keep noise scale fixed throughout.)
-
-At test time, to see how well the policy exploits what it has learned, we do not add noise to the actions.
-
-.. admonition:: You Should Know
-
- Our TD3 implementation uses a trick to improve exploration at the start of training. For a fixed number of steps at the beginning (set with the ``start_steps`` keyword argument), the agent takes actions which are sampled from a uniform random distribution over valid actions. After that, it returns to normal TD3 exploration.
-
-
-Pseudocode
-----------
-
-
-.. math::
- :nowrap:
-
- \begin{algorithm}[H]
- \caption{Twin Delayed DDPG}
- \label{alg1}
- \begin{algorithmic}[1]
- \STATE Input: initial policy parameters $\theta$, Q-function parameters $\phi_1$, $\phi_2$, empty replay buffer $\mathcal{D}$
- \STATE Set target parameters equal to main parameters $\theta_{\text{targ}} \leftarrow \theta$, $\phi_{\text{targ},1} \leftarrow \phi_1$, $\phi_{\text{targ},2} \leftarrow \phi_2$
- \REPEAT
- \STATE Observe state $s$ and select action $a = \text{clip}(\mu_{\theta}(s) + \epsilon, a_{Low}, a_{High})$, where $\epsilon \sim \mathcal{N}$
- \STATE Execute $a$ in the environment
- \STATE Observe next state $s'$, reward $r$, and done signal $d$ to indicate whether $s'$ is terminal
- \STATE Store $(s,a,r,s',d)$ in replay buffer $\mathcal{D}$
- \STATE If $s'$ is terminal, reset environment state.
- \IF{it's time to update}
- \FOR{$j$ in range(however many updates)}
- \STATE Randomly sample a batch of transitions, $B = \{ (s,a,r,s',d) \}$ from $\mathcal{D}$
- \STATE Compute target actions
- \begin{equation*}
- a'(s') = \text{clip}\left(\mu_{\theta_{\text{targ}}}(s') + \text{clip}(\epsilon,-c,c), a_{Low}, a_{High}\right), \;\;\;\;\; \epsilon \sim \mathcal{N}(0, \sigma)
- \end{equation*}
- \STATE Compute targets
- \begin{equation*}
- y(r,s',d) = r + \gamma (1-d) \min_{i=1,2} Q_{\phi_{\text{targ},i}}(s', a'(s'))
- \end{equation*}
- \STATE Update Q-functions by one step of gradient descent using
- \begin{align*}
- & \nabla_{\phi_i} \frac{1}{|B|}\sum_{(s,a,r,s',d) \in B} \left( Q_{\phi_i}(s,a) - y(r,s',d) \right)^2 && \text{for } i=1,2
- \end{align*}
- \IF{ $j \mod$ \texttt{policy\_delay} $ = 0$}
- \STATE Update policy by one step of gradient ascent using
- \begin{equation*}
- \nabla_{\theta} \frac{1}{|B|}\sum_{s \in B}Q_{\phi_1}(s, \mu_{\theta}(s))
- \end{equation*}
- \STATE Update target networks with
- \begin{align*}
- \phi_{\text{targ},i} &\leftarrow \rho \phi_{\text{targ},i} + (1-\rho) \phi_i && \text{for } i=1,2\\
- \theta_{\text{targ}} &\leftarrow \rho \theta_{\text{targ}} + (1-\rho) \theta
- \end{align*}
- \ENDIF
- \ENDFOR
- \ENDIF
- \UNTIL{convergence}
- \end{algorithmic}
- \end{algorithm}
-
-
-
-
-Documentation
-=============
-
-.. autofunction:: spinup.td3
-
-Saved Model Contents
---------------------
-
-The computation graph saved by the logger includes:
-
-======== ====================================================================
-Key Value
-======== ====================================================================
-``x`` Tensorflow placeholder for state input.
-``a`` Tensorflow placeholder for action input.
-``pi`` | Deterministically computes an action from the agent, conditioned
- | on states in ``x``.
-``q1`` Gives one action-value estimate for states in ``x`` and actions in ``a``.
-``q2`` Gives the other action-value estimate for states in ``x`` and actions in ``a``.
-======== ====================================================================
-
-This saved model can be accessed either by
-
-* running the trained policy with the `test_policy.py`_ tool,
-* or loading the whole saved graph into a program with `restore_tf_graph`_.
-
-.. _`test_policy.py`: ../user/saving_and_loading.html#loading-and-running-trained-policies
-.. _`restore_tf_graph`: ../utils/logger.html#spinup.utils.logx.restore_tf_graph
-
-References
-==========
-
-Relevant Papers
----------------
-
-- `Addressing Function Approximation Error in Actor-Critic Methods`_, Fujimoto et al, 2018
-
-.. _`Addressing Function Approximation Error in Actor-Critic Methods`: https://arxiv.org/abs/1802.09477
-
-
-Other Public Implementations
-----------------------------
-
-- `TD3 release repo`_
-
-.. _`TD3 release repo`: https://github.com/sfujim/TD3
\ No newline at end of file
diff --git a/docs/_build/html/_sources/algorithms/trpo.rst.txt b/docs/_build/html/_sources/algorithms/trpo.rst.txt
deleted file mode 100644
index a04b04fc1..000000000
--- a/docs/_build/html/_sources/algorithms/trpo.rst.txt
+++ /dev/null
@@ -1,211 +0,0 @@
-================================
-Trust Region Policy Optimization
-================================
-
-.. contents:: Table of Contents
-
-
-
-Background
-==========
-
-(Previously: `Background for VPG`_)
-
-.. _`Background for VPG`: ../algorithms/vpg.html#background
-
-TRPO updates policies by taking the largest step possible to improve performance, while satisfying a special constraint on how close the new and old policies are allowed to be. The constraint is expressed in terms of `KL-Divergence`_, a measure of (something like, but not exactly) distance between probability distributions.
-
-This is different from normal policy gradient, which keeps new and old policies close in parameter space. But even seemingly small differences in parameter space can have very large differences in performance---so a single bad step can collapse the policy performance. This makes it dangerous to use large step sizes with vanilla policy gradients, thus hurting its sample efficiency. TRPO nicely avoids this kind of collapse, and tends to quickly and monotonically improve performance.
-
-.. _`KL-Divergence`: https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
-
-Quick Facts
------------
-
-* TRPO is an on-policy algorithm.
-* TRPO can be used for environments with either discrete or continuous action spaces.
-* The Spinning Up implementation of TRPO supports parallelization with MPI.
-
-Key Equations
--------------
-
-Let :math:`\pi_{\theta}` denote a policy with parameters :math:`\theta`. The theoretical TRPO update is:
-
-.. math::
-
- \theta_{k+1} = \arg \max_{\theta} \; & {\mathcal L}(\theta_k, \theta) \\
- \text{s.t.} \; & \bar{D}_{KL}(\theta || \theta_k) \leq \delta
-
-where :math:`{\mathcal L}(\theta_k, \theta)` is the *surrogate advantage*, a measure of how policy :math:`\pi_{\theta}` performs relative to the old policy :math:`\pi_{\theta_k}` using data from the old policy:
-
-.. math::
-
- {\mathcal L}(\theta_k, \theta) = \underE{s,a \sim \pi_{\theta_k}}{
- \frac{\pi_{\theta}(a|s)}{\pi_{\theta_k}(a|s)} A^{\pi_{\theta_k}}(s,a)
- },
-
-and :math:`\bar{D}_{KL}(\theta || \theta_k)` is an average KL-divergence between policies across states visited by the old policy:
-
-.. math::
-
- \bar{D}_{KL}(\theta || \theta_k) = \underE{s \sim \pi_{\theta_k}}{
- D_{KL}\left(\pi_{\theta}(\cdot|s) || \pi_{\theta_k} (\cdot|s) \right)
- }.
-
-.. admonition:: You Should Know
-
- The objective and constraint are both zero when :math:`\theta = \theta_k`. Furthermore, the gradient of the constraint with respect to :math:`\theta` is zero when :math:`\theta = \theta_k`. Proving these facts requires some subtle command of the relevant math---it's an exercise worth doing, whenever you feel ready!
-
-
-The theoretical TRPO update isn't the easiest to work with, so TRPO makes some approximations to get an answer quickly. We Taylor expand the objective and constraint to leading order around :math:`\theta_k`:
-
-.. math::
-
- {\mathcal L}(\theta_k, \theta) &\approx g^T (\theta - \theta_k) \\
- \bar{D}_{KL}(\theta || \theta_k) & \approx \frac{1}{2} (\theta - \theta_k)^T H (\theta - \theta_k)
-
-resulting in an approximate optimization problem,
-
-.. math::
-
- \theta_{k+1} = \arg \max_{\theta} \; & g^T (\theta - \theta_k) \\
- \text{s.t.} \; & \frac{1}{2} (\theta - \theta_k)^T H (\theta - \theta_k) \leq \delta.
-
-.. admonition:: You Should Know
-
- By happy coincidence, the gradient :math:`g` of the surrogate advantage function with respect to :math:`\theta`, evaluated at :math:`\theta = \theta_k`, is exactly equal to the policy gradient, :math:`\nabla_{\theta} J(\pi_{\theta})`! Try proving this, if you feel comfortable diving into the math.
-
-This approximate problem can be analytically solved by the methods of Lagrangian duality [1]_, yielding the solution:
-
-.. math::
-
- \theta_{k+1} = \theta_k + \sqrt{\frac{2 \delta}{g^T H^{-1} g}} H^{-1} g.
-
-If we were to stop here, and just use this final result, the algorithm would be exactly calculating the `Natural Policy Gradient`_. A problem is that, due to the approximation errors introduced by the Taylor expansion, this may not satisfy the KL constraint, or actually improve the surrogate advantage. TRPO adds a modification to this update rule: a backtracking line search,
-
-.. math::
-
- \theta_{k+1} = \theta_k + \alpha^j \sqrt{\frac{2 \delta}{g^T H^{-1} g}} H^{-1} g,
-
-where :math:`\alpha \in (0,1)` is the backtracking coefficient, and :math:`j` is the smallest nonnegative integer such that :math:`\pi_{\theta_{k+1}}` satisfies the KL constraint and produces a positive surrogate advantage.
-
-Lastly: computing and storing the matrix inverse, :math:`H^{-1}`, is painfully expensive when dealing with neural network policies with thousands or millions of parameters. TRPO sidesteps the issue by using the `conjugate gradient`_ algorithm to solve :math:`Hx = g` for :math:`x = H^{-1} g`, requiring only a function which can compute the matrix-vector product :math:`Hx` instead of computing and storing the whole matrix :math:`H` directly. This is not too hard to do: we set up a symbolic operation to calculate
-
-.. math::
-
- Hx = \nabla_{\theta} \left( \left(\nabla_{\theta} \bar{D}_{KL}(\theta || \theta_k)\right)^T x \right),
-
-which gives us the correct output without computing the whole matrix.
-
-.. [1] See `Convex Optimization`_ by Boyd and Vandenberghe, especially chapters 2 through 5.
-
-.. _`Convex Optimization`: http://stanford.edu/~boyd/cvxbook/
-.. _`Natural Policy Gradient`: https://papers.nips.cc/paper/2073-a-natural-policy-gradient.pdf
-.. _`conjugate gradient`: https://en.wikipedia.org/wiki/Conjugate_gradient_method
-
-
-Exploration vs. Exploitation
-----------------------------
-
-TRPO trains a stochastic policy in an on-policy way. This means that it explores by sampling actions according to the latest version of its stochastic policy. The amount of randomness in action selection depends on both initial conditions and the training procedure. Over the course of training, the policy typically becomes progressively less random, as the update rule encourages it to exploit rewards that it has already found. This may cause the policy to get trapped in local optima.
-
-
-Pseudocode
-----------
-
-.. math::
- :nowrap:
-
- \begin{algorithm}[H]
- \caption{Trust Region Policy Optimization}
- \label{alg1}
- \begin{algorithmic}[1]
- \STATE Input: initial policy parameters $\theta_0$, initial value function parameters $\phi_0$
- \STATE Hyperparameters: KL-divergence limit $\delta$, backtracking coefficient $\alpha$, maximum number of backtracking steps $K$
- \FOR{$k = 0,1,2,...$}
- \STATE Collect set of trajectories ${\mathcal D}_k = \{\tau_i\}$ by running policy $\pi_k = \pi(\theta_k)$ in the environment.
- \STATE Compute rewards-to-go $\hat{R}_t$.
- \STATE Compute advantage estimates, $\hat{A}_t$ (using any method of advantage estimation) based on the current value function $V_{\phi_k}$.
- \STATE Estimate policy gradient as
- \begin{equation*}
- \hat{g}_k = \frac{1}{|{\mathcal D}_k|} \sum_{\tau \in {\mathcal D}_k} \sum_{t=0}^T \left. \nabla_{\theta} \log\pi_{\theta}(a_t|s_t)\right|_{\theta_k} \hat{A}_t.
- \end{equation*}
- \STATE Use the conjugate gradient algorithm to compute
- \begin{equation*}
- \hat{x}_k \approx \hat{H}_k^{-1} \hat{g}_k,
- \end{equation*}
- where $\hat{H}_k$ is the Hessian of the sample average KL-divergence.
- \STATE Update the policy by backtracking line search with
- \begin{equation*}
- \theta_{k+1} = \theta_k + \alpha^j \sqrt{ \frac{2\delta}{\hat{x}_k^T \hat{H}_k \hat{x}_k}} \hat{x}_k,
- \end{equation*}
- where $j \in \{0, 1, 2, \ldots, K\}$ is the smallest value which improves the sample loss and satisfies the sample KL-divergence constraint.
- \STATE Fit value function by regression on mean-squared error:
- \begin{equation*}
- \phi_{k+1} = \arg \min_{\phi} \frac{1}{|{\mathcal D}_k| T} \sum_{\tau \in {\mathcal D}_k} \sum_{t=0}^T\left( V_{\phi} (s_t) - \hat{R}_t \right)^2,
- \end{equation*}
- typically via some gradient descent algorithm.
- \ENDFOR
- \end{algorithmic}
- \end{algorithm}
-
-
-
-Documentation
-=============
-
-.. autofunction:: spinup.trpo
-
-
-Saved Model Contents
---------------------
-
-The computation graph saved by the logger includes:
-
-======== ====================================================================
-Key Value
-======== ====================================================================
-``x`` Tensorflow placeholder for state input.
-``pi`` Samples an action from the agent, conditioned on states in ``x``.
-``v`` Gives value estimate for states in ``x``.
-======== ====================================================================
-
-This saved model can be accessed either by
-
-* running the trained policy with the `test_policy.py`_ tool,
-* or loading the whole saved graph into a program with `restore_tf_graph`_.
-
-.. _`test_policy.py`: ../user/saving_and_loading.html#loading-and-running-trained-policies
-.. _`restore_tf_graph`: ../utils/logger.html#spinup.utils.logx.restore_tf_graph
-
-References
-==========
-
-Relevant Papers
----------------
-
-- `Trust Region Policy Optimization`_, Schulman et al. 2015
-- `High Dimensional Continuous Control Using Generalized Advantage Estimation`_, Schulman et al. 2016
-- `Approximately Optimal Approximate Reinforcement Learning`_, Kakade and Langford 2002
-
-.. _`Trust Region Policy Optimization`: https://arxiv.org/abs/1502.05477
-.. _`High Dimensional Continuous Control Using Generalized Advantage Estimation`: https://arxiv.org/abs/1506.02438
-.. _`Approximately Optimal Approximate Reinforcement Learning`: https://people.eecs.berkeley.edu/~pabbeel/cs287-fa09/readings/KakadeLangford-icml2002.pdf
-
-Why These Papers?
------------------
-
-Schulman 2015 is included because it is the original paper describing TRPO. Schulman 2016 is included because our implementation of TRPO makes use of Generalized Advantage Estimation for computing the policy gradient. Kakade and Langford 2002 is included because it contains theoretical results which motivate and deeply connect to the theoretical foundations of TRPO.
-
-
-
-Other Public Implementations
-----------------------------
-
-- Baselines_
-- ModularRL_
-- rllab_
-
-.. _Baselines: https://github.com/openai/baselines/tree/master/baselines/trpo_mpi
-.. _ModularRL: https://github.com/joschu/modular_rl/blob/master/modular_rl/trpo.py
-.. _rllab: https://github.com/rll/rllab/blob/master/rllab/algos/trpo.py
\ No newline at end of file
diff --git a/docs/_build/html/_sources/algorithms/vpg.rst.txt b/docs/_build/html/_sources/algorithms/vpg.rst.txt
deleted file mode 100644
index bc4e06a2e..000000000
--- a/docs/_build/html/_sources/algorithms/vpg.rst.txt
+++ /dev/null
@@ -1,140 +0,0 @@
-=======================
-Vanilla Policy Gradient
-=======================
-
-.. contents:: Table of Contents
-
-
-Background
-==========
-
-(Previously: `Introduction to RL, Part 3`_)
-
-.. _`Introduction to RL, Part 3`: ../spinningup/rl_intro3.html
-
-The key idea underlying policy gradients is to push up the probabilities of actions that lead to higher return, and push down the probabilities of actions that lead to lower return, until you arrive at the optimal policy.
-
-Quick Facts
------------
-
-* VPG is an on-policy algorithm.
-* VPG can be used for environments with either discrete or continuous action spaces.
-* The Spinning Up implementation of VPG supports parallelization with MPI.
-
-Key Equations
--------------
-
-Let :math:`\pi_{\theta}` denote a policy with parameters :math:`\theta`, and :math:`J(\pi_{\theta})` denote the expected finite-horizon undiscounted return of the policy. The gradient of :math:`J(\pi_{\theta})` is
-
-.. math::
-
- \nabla_{\theta} J(\pi_{\theta}) = \underE{\tau \sim \pi_{\theta}}{
- \sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t|s_t) A^{\pi_{\theta}}(s_t,a_t)
- },
-
-where :math:`\tau` is a trajectory and :math:`A^{\pi_{\theta}}` is the advantage function for the current policy.
-
-The policy gradient algorithm works by updating policy parameters via stochastic gradient ascent on policy performance:
-
-.. math::
-
- \theta_{k+1} = \theta_k + \alpha \nabla_{\theta} J(\pi_{\theta_k})
-
-Policy gradient implementations typically compute advantage function estimates based on the infinite-horizon discounted return, despite otherwise using the finite-horizon undiscounted policy gradient formula.
-
-Exploration vs. Exploitation
-----------------------------
-
-VPG trains a stochastic policy in an on-policy way. This means that it explores by sampling actions according to the latest version of its stochastic policy. The amount of randomness in action selection depends on both initial conditions and the training procedure. Over the course of training, the policy typically becomes progressively less random, as the update rule encourages it to exploit rewards that it has already found. This may cause the policy to get trapped in local optima.
-
-
-Pseudocode
-----------
-
-.. math::
- :nowrap:
-
- \begin{algorithm}[H]
- \caption{Vanilla Policy Gradient Algorithm}
- \label{alg1}
- \begin{algorithmic}[1]
- \STATE Input: initial policy parameters $\theta_0$, initial value function parameters $\phi_0$
- \FOR{$k = 0,1,2,...$}
- \STATE Collect set of trajectories ${\mathcal D}_k = \{\tau_i\}$ by running policy $\pi_k = \pi(\theta_k)$ in the environment.
- \STATE Compute rewards-to-go $\hat{R}_t$.
- \STATE Compute advantage estimates, $\hat{A}_t$ (using any method of advantage estimation) based on the current value function $V_{\phi_k}$.
- \STATE Estimate policy gradient as
- \begin{equation*}
- \hat{g}_k = \frac{1}{|{\mathcal D}_k|} \sum_{\tau \in {\mathcal D}_k} \sum_{t=0}^T \left. \nabla_{\theta} \log\pi_{\theta}(a_t|s_t)\right|_{\theta_k} \hat{A}_t.
- \end{equation*}
- \STATE Compute policy update, either using standard gradient ascent,
- \begin{equation*}
- \theta_{k+1} = \theta_k + \alpha_k \hat{g}_k,
- \end{equation*}
- or via another gradient ascent algorithm like Adam.
- \STATE Fit value function by regression on mean-squared error:
- \begin{equation*}
- \phi_{k+1} = \arg \min_{\phi} \frac{1}{|{\mathcal D}_k| T} \sum_{\tau \in {\mathcal D}_k} \sum_{t=0}^T\left( V_{\phi} (s_t) - \hat{R}_t \right)^2,
- \end{equation*}
- typically via some gradient descent algorithm.
- \ENDFOR
- \end{algorithmic}
- \end{algorithm}
-
-
-Documentation
-=============
-
-.. autofunction:: spinup.vpg
-
-Saved Model Contents
---------------------
-
-The computation graph saved by the logger includes:
-
-======== ====================================================================
-Key Value
-======== ====================================================================
-``x`` Tensorflow placeholder for state input.
-``pi`` Samples an action from the agent, conditioned on states in ``x``.
-``v`` Gives value estimate for states in ``x``.
-======== ====================================================================
-
-This saved model can be accessed either by
-
-* running the trained policy with the `test_policy.py`_ tool,
-* or loading the whole saved graph into a program with `restore_tf_graph`_.
-
-.. _`test_policy.py`: ../user/saving_and_loading.html#loading-and-running-trained-policies
-.. _`restore_tf_graph`: ../utils/logger.html#spinup.utils.logx.restore_tf_graph
-
-References
-==========
-
-Relevant Papers
----------------
-
-- `Policy Gradient Methods for Reinforcement Learning with Function Approximation`_, Sutton et al. 2000
-- `Optimizing Expectations: From Deep Reinforcement Learning to Stochastic Computation Graphs`_, Schulman 2016(a)
-- `Benchmarking Deep Reinforcement Learning for Continuous Control`_, Duan et al. 2016
-- `High Dimensional Continuous Control Using Generalized Advantage Estimation`_, Schulman et al. 2016(b)
-
-.. _`Policy Gradient Methods for Reinforcement Learning with Function Approximation`: https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf
-.. _`Optimizing Expectations: From Deep Reinforcement Learning to Stochastic Computation Graphs`: http://joschu.net/docs/thesis.pdf
-.. _`Benchmarking Deep Reinforcement Learning for Continuous Control`: https://arxiv.org/abs/1604.06778
-.. _`High Dimensional Continuous Control Using Generalized Advantage Estimation`: https://arxiv.org/abs/1506.02438
-
-Why These Papers?
------------------
-
-Sutton 2000 is included because it is a timeless classic of reinforcement learning theory, and contains references to the earlier work which led to modern policy gradients. Schulman 2016(a) is included because Chapter 2 contains a lucid introduction to the theory of policy gradient algorithms, including pseudocode. Duan 2016 is a clear, recent benchmark paper that shows how vanilla policy gradient in the deep RL setting (eg with neural network policies and Adam as the optimizer) compares with other deep RL algorithms. Schulman 2016(b) is included because our implementation of VPG makes use of Generalized Advantage Estimation for computing the policy gradient.
-
-
-Other Public Implementations
-----------------------------
-
-- rllab_
-- `rllib (Ray)`_
-
-.. _rllab: https://github.com/rll/rllab/blob/master/rllab/algos/vpg.py
-.. _`rllib (Ray)`: https://github.com/ray-project/ray/blob/master/python/ray/rllib/agents/pg
diff --git a/docs/_build/html/_sources/etc/acknowledgements.rst.txt b/docs/_build/html/_sources/etc/acknowledgements.rst.txt
deleted file mode 100644
index 17a0639f5..000000000
--- a/docs/_build/html/_sources/etc/acknowledgements.rst.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-================
-Acknowledgements
-================
-
-We gratefully acknowledge the contributions of the many people who helped get this project off of the ground, including people who beta tested the software, gave feedback on the material, improved dependencies of Spinning Up code in service of this release, or otherwise supported the project. Given the number of people who were involved at various points, this list of names may not be exhaustive. (If you think you should have been listed here, please do not hesitate to reach out.)
-
-In no particular order, thank you Alex Ray, Amanda Askell, Ben Garfinkel, Christy Dennison, Coline Devin, Daniel Zeigler, Dylan Hadfield-Menell, Ge Yang, Greg Khan, Jack Clark, Jonas Rothfuss, Larissa Schiavo, Leandro Castelao, Lilian Weng, Maddie Hall, Matthias Plappert, Miles Brundage, Peter Zokhov, and Pieter Abbeel.
-
-We are also grateful to Pieter Abbeel's group at Berkeley, and the Center for Human-Compatible AI, for giving feedback on presentations about Spinning Up.
\ No newline at end of file
diff --git a/docs/_build/html/_sources/etc/author.rst.txt b/docs/_build/html/_sources/etc/author.rst.txt
deleted file mode 100644
index 61f932e8a..000000000
--- a/docs/_build/html/_sources/etc/author.rst.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-================
-About the Author
-================
-
-Spinning Up in Deep RL was primarily developed by Josh Achiam, a research scientist on the OpenAI Safety Team and PhD student at UC Berkeley advised by Pieter Abbeel. Josh studies topics related to safety in deep reinforcement learning, and has previously published work on `safe exploration`_.
-
-.. _`safe exploration`: https://arxiv.org/abs/1705.10528
\ No newline at end of file
diff --git a/docs/_build/html/_sources/index.rst.txt b/docs/_build/html/_sources/index.rst.txt
deleted file mode 100644
index a50c84d8d..000000000
--- a/docs/_build/html/_sources/index.rst.txt
+++ /dev/null
@@ -1,71 +0,0 @@
-.. Spinning Up documentation master file, created by
- sphinx-quickstart on Wed Aug 15 04:21:07 2018.
- You can adapt this file completely to your liking, but it should at least
- contain the root `toctree` directive.
-
-Welcome to Spinning Up in Deep RL!
-==================================
-
-.. image:: images/spinning-up-in-rl.png
-
-.. toctree::
- :maxdepth: 2
- :caption: User Documentation
-
- user/introduction
- user/installation
- user/algorithms
- user/running
- user/saving_and_loading
- user/plotting
-
-.. toctree::
- :maxdepth: 2
- :caption: Introduction to RL
-
- spinningup/rl_intro
- spinningup/rl_intro2
- spinningup/rl_intro3
-
-.. toctree::
- :maxdepth: 2
- :caption: Resources
-
- spinningup/spinningup
- spinningup/keypapers
- spinningup/exercises
- spinningup/bench
-
-.. toctree::
- :maxdepth: 2
- :caption: Algorithms Docs
-
- algorithms/vpg
- algorithms/trpo
- algorithms/ppo
- algorithms/ddpg
- algorithms/td3
- algorithms/sac
-
-.. toctree::
- :maxdepth: 2
- :caption: Utilities Docs
-
- utils/logger
- utils/plotter
- utils/mpi
- utils/run_utils
-
-.. toctree::
- :maxdepth: 2
- :caption: Etc.
-
- etc/acknowledgements
- etc/author
-
-Indices and tables
-==================
-
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
diff --git a/docs/_build/html/_sources/spinningup/bench.rst.txt b/docs/_build/html/_sources/spinningup/bench.rst.txt
deleted file mode 100644
index 596b49559..000000000
--- a/docs/_build/html/_sources/spinningup/bench.rst.txt
+++ /dev/null
@@ -1,64 +0,0 @@
-==========================================
-Benchmarks for Spinning Up Implementations
-==========================================
-
-.. contents:: Table of Contents
-
-We benchmarked the Spinning Up algorithm implementations in five environments from the MuJoCo_ Gym task suite: HalfCheetah, Hopper, Walker2d, Swimmer, and Ant.
-
-.. _MuJoCo: https://gym.openai.com/envs/#mujoco
-
-Performance in Each Environment
-===============================
-
-HalfCheetah
------------
-
-.. figure:: ../images/bench/bench_halfcheetah.svg
- :align: center
-
- 3M timestep benchmark for HalfCheetah-v2.
-
-
-Hopper
-------
-
-.. figure:: ../images/bench/bench_hopper.svg
- :align: center
-
- 3M timestep benchmark for Hopper-v2.
-
-Walker
-------
-
-.. figure:: ../images/bench/bench_walker.svg
- :align: center
-
- 3M timestep benchmark for Walker2d-v2.
-
-Swimmer
--------
-.. figure:: ../images/bench/bench_swim.svg
- :align: center
-
- 3M timestep benchmark for Swimmer-v2.
-
-Ant
----
-.. figure:: ../images/bench/bench_ant.svg
- :align: center
-
- 3M timestep benchmark for Ant-v2.
-
-Experiment Details
-==================
-
-**Random seeds.** The on-policy algorithms (VPG, TRPO, PPO) were run for 3 random seeds each, and the off-policy algorithms (DDPG, TD3, SAC) were run for 10 random seeds each. Graphs show the average (solid line) and std dev (shaded) of performance over random seed over the course of training.
-
-**Performance metric.** Performance for the on-policy algorithms is measured as the average trajectory return across the batch collected at each epoch. Performance for the off-policy algorithms is measured once every 10,000 steps by running the deterministic policy (or, in the case of SAC, the mean policy) without action noise for ten trajectories, and reporting the average return over those test trajectories.
-
-**Network architectures.** The on-policy algorithms use networks of size (64, 32) with tanh units for both the policy and the value function. The off-policy algorithms use networks of size (400, 300) with relu units.
-
-**Batch size.** The on-policy algorithms collected 4000 steps of agent-environment interaction per batch update. The off-policy algorithms used minibatches of size 100 at each gradient descent step.
-
-All other hyperparameters are left at default settings for the Spinning Up implementations. See algorithm pages for details.
\ No newline at end of file
diff --git a/docs/_build/html/_sources/spinningup/exercise2_1_soln.rst.txt b/docs/_build/html/_sources/spinningup/exercise2_1_soln.rst.txt
deleted file mode 100644
index ef5ffc38a..000000000
--- a/docs/_build/html/_sources/spinningup/exercise2_1_soln.rst.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-========================
-Solution to Exercise 2.1
-========================
-
-.. figure:: ../images/ex2-1_trpo_hopper.png
- :align: center
-
- Learning curves for TRPO in Hopper-v2 with different values of ``train_v_iters``, averaged over three random seeds.
-
-
-The difference is quite substantial: with a trained value function, the agent is able to quickly make progress. With an untrained value function, the agent gets stuck early on.
\ No newline at end of file
diff --git a/docs/_build/html/_sources/spinningup/exercise2_2_soln.rst.txt b/docs/_build/html/_sources/spinningup/exercise2_2_soln.rst.txt
deleted file mode 100644
index e194eda6a..000000000
--- a/docs/_build/html/_sources/spinningup/exercise2_2_soln.rst.txt
+++ /dev/null
@@ -1,146 +0,0 @@
-========================
-Solution to Exercise 2.2
-========================
-
-.. figure:: ../images/ex2-2_ddpg_bug.svg
- :align: center
-
- Learning curves for DDPG in HalfCheetah-v2 for bugged and non-bugged actor-critic implementations, averaged over three random seeds.
-
-
-The Bug in the Code
-===================
-
-The only difference between the correct actor-critic code,
-
-.. code-block:: python
- :emphasize-lines: 11, 13
-
- """
- Actor-Critic
- """
- def mlp_actor_critic(x, a, hidden_sizes=(400,300), activation=tf.nn.relu,
- output_activation=tf.tanh, action_space=None):
- act_dim = a.shape.as_list()[-1]
- act_limit = action_space.high[0]
- with tf.variable_scope('pi'):
- pi = act_limit * mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation)
- with tf.variable_scope('q'):
- q = tf.squeeze(mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1)
- with tf.variable_scope('q', reuse=True):
- q_pi = tf.squeeze(mlp(tf.concat([x,pi], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1)
- return pi, q, q_pi
-
-and the bugged actor-critic code,
-
-.. code-block:: python
- :emphasize-lines: 11, 13
-
- """
- Bugged Actor-Critic
- """
- def bugged_mlp_actor_critic(x, a, hidden_sizes=(400,300), activation=tf.nn.relu,
- output_activation=tf.tanh, action_space=None):
- act_dim = a.shape.as_list()[-1]
- act_limit = action_space.high[0]
- with tf.variable_scope('pi'):
- pi = act_limit * mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation)
- with tf.variable_scope('q'):
- q = mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None)
- with tf.variable_scope('q', reuse=True):
- q_pi = mlp(tf.concat([x,pi], axis=-1), list(hidden_sizes)+[1], activation, None)
- return pi, q, q_pi
-
-is the tensor shape for the Q-functions. The correct version squeezes outputs so that they have shape ``[batch size]``, whereas the bugged version doesn't, resulting in Q-functions with shape ``[batch size, 1]``.
-
-
-How it Gums Up the Works
-========================
-
-Consider the excerpt from the part in the code that builds the DDPG computation graph:
-
-.. code-block:: python
-
- # Bellman backup for Q function
- backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*q_pi_targ)
-
- # DDPG losses
- pi_loss = -tf.reduce_mean(q_pi)
- q_loss = tf.reduce_mean((q-backup)**2)
-
-This is where the tensor shape issue comes into play. It's important to know that ``r_ph`` and ``d_ph`` have shape ``[batch size]``.
-
-The line that produces the Bellman backup was written with the assumption that it would add together tensors with the same shape. However, this line can **also** add together tensors with different shapes, as long as they're broadcast-compatible.
-
-Tensors with shapes ``[batch size]`` and ``[batch size, 1]`` are broadcast compatible, but the behavior is not actually what you might expect! Check out this example:
-
->>> import tensorflow as tf
->>> import numpy as np
->>> x = tf.constant(np.arange(5))
->>> y = tf.constant(np.arange(5).reshape(-1,1))
->>> z1 = x * y
->>> z2 = x + y
->>> z3 = x + z1
->>> x.shape
-TensorShape([Dimension(5)])
->>> y.shape
-TensorShape([Dimension(5), Dimension(1)])
->>> z1.shape
-TensorShape([Dimension(5), Dimension(5)])
->>> z2.shape
-TensorShape([Dimension(5), Dimension(5)])
->>> sess = tf.InteractiveSession()
->>> sess.run(z1)
-array([[ 0, 0, 0, 0, 0],
- [ 0, 1, 2, 3, 4],
- [ 0, 2, 4, 6, 8],
- [ 0, 3, 6, 9, 12],
- [ 0, 4, 8, 12, 16]])
->>> sess.run(z2)
-array([[0, 1, 2, 3, 4],
- [1, 2, 3, 4, 5],
- [2, 3, 4, 5, 6],
- [3, 4, 5, 6, 7],
- [4, 5, 6, 7, 8]])
->>> sess.run(z3)
-array([[ 0, 1, 2, 3, 4],
- [ 0, 2, 4, 6, 8],
- [ 0, 3, 6, 9, 12],
- [ 0, 4, 8, 12, 16],
- [ 0, 5, 10, 15, 20]])
-
-Adding or multiplying a shape ``[5]`` tensor by a shape ``[5,1]`` tensor returns a shape ``[5,5]`` tensor!
-
-When you don't squeeze the Q-functions, ``q_pi_targ`` has shape ``[batch size, 1]``, and the backup---and in turn, the whole Q-loss---gets totally messed up.
-
-Broadcast error 1: ``(1 - d_ph) * q_pi_targ`` becomes a ``[batch size, batch size]`` tensor containing the outer product of the mask with the target network Q-values.
-
-Broadcast error 2: ``r_ph`` then gets treated as a row vector and added to each row of ``(1 - d_ph) * q_pi_targ`` separately.
-
-Broadcast error 3: ``q_loss`` depends on ``q - backup``, which involves another bad broadcast between ``q`` (shape ``[batch size, 1]``) and ``backup`` (shape ``[batch size, batch size]``).
-
-To put it mathematically: let :math:`q`, :math:`q'`, :math:`r`, :math:`d` denote vectors containing the q-values, target q-values, rewards, and dones for a given batch, where there are :math:`n` entries in the batch. The correct backup is
-
-.. math::
-
- z_i = r_i + \gamma (1-d_i) q'_i,
-
-and the correct loss function is
-
-.. math::
-
- \frac{1}{n} \sum_{i=1}^n (q_i - z_i)^2.
-
-But with these errors, what gets computed is a backup *matrix*,
-
-.. math::
-
- z_{ij} = r_j + \gamma (1-d_j) q'_i,
-
-and a messed up loss function
-
-.. math::
-
- \frac{1}{n^2} \sum_{i=1}^n \sum_{j=1}^n (q_j - z_{ij})^2.
-
-If you leave this to run in HalfCheetah long enough, you'll actually see some non-trivial learning process, because weird details specific to this environment partly cancel out the errors. But almost everywhere else, it fails completely.
\ No newline at end of file
diff --git a/docs/_build/html/_sources/spinningup/exercises.rst.txt b/docs/_build/html/_sources/spinningup/exercises.rst.txt
deleted file mode 100644
index 937391805..000000000
--- a/docs/_build/html/_sources/spinningup/exercises.rst.txt
+++ /dev/null
@@ -1,128 +0,0 @@
-=========
-Exercises
-=========
-
-
-.. contents:: Table of Contents
- :depth: 2
-
-Problem Set 1: Basics of Implementation
----------------------------------------
-
-.. admonition:: Exercise 1.1: Gaussian Log-Likelihood
-
- **Path to Exercise.** ``spinup/exercises/problem_set_1/exercise1_1.py``
-
- **Path to Solution.** ``spinup/exercises/problem_set_1_solutions/exercise1_1_soln.py``
-
- **Instructions.** Write a function which takes in Tensorflow symbols for the means and log stds of a batch of diagonal Gaussian distributions, along with a Tensorflow placeholder for (previously-generated) samples from those distributions, and returns a Tensorflow symbol for computing the log likelihoods of those samples.
-
- You may find it useful to review the formula given in `this section of the RL introduction`_.
-
- Implement your solution in ``exercise1_1.py``, and run that file to automatically check your work.
-
- **Evaluation Criteria.** Your solution will be checked by comparing outputs against a known-good implementation, using a batch of random inputs.
-
-.. _`this section of the RL introduction`: ../spinningup/rl_intro.html#stochastic-policies
-
-
-.. admonition:: Exercise 1.2: Policy for PPO
-
- **Path to Exercise.** ``spinup/exercises/problem_set_1/exercise1_2.py``
-
- **Path to Solution.** ``spinup/exercises/problem_set_1_solutions/exercise1_2_soln.py``
-
- **Instructions.** Implement an MLP diagonal Gaussian policy for PPO.
-
- Implement your solution in ``exercise1_2.py``, and run that file to automatically check your work.
-
- **Evaluation Criteria.** Your solution will be evaluated by running for 20 epochs in the InvertedPendulum-v2 Gym environment, and this should take in the ballpark of 3-5 minutes (depending on your machine, and other processes you are running in the background). The bar for success is reaching an average score of over 500 in the last 5 epochs, or getting to a score of 1000 (the maximum possible score) in the last 5 epochs.
-
-
-.. admonition:: Exercise 1.3: Computation Graph for TD3
-
- **Path to Exercise.** ``spinup/exercises/problem_set_1/exercise1_3.py``
-
- **Path to Solution.** ``spinup/algos/td3/td3.py``
-
- **Instructions.** Implement the core computation graph for the TD3 algorithm.
-
- As starter code, you are given the entirety of the TD3 algorithm except for the computation graph. Find "YOUR CODE HERE" to begin.
-
- You may find it useful to review the pseudocode in our `page on TD3`_.
-
- Implement your solution in ``exercise1_3.py``, and run that file to see the results of your work. There is no automatic checking for this exercise.
-
- **Evaluation Criteria.** Evaluate your code by running ``exercise1_3.py`` with HalfCheetah-v2, InvertedPendulum-v2, and one other Gym MuJoCo environment of your choosing (set via the ``--env`` flag). It is set up to use smaller neural networks (hidden sizes [128,128]) than typical for TD3, with a maximum episode length of 150, and to run for only 10 epochs. The goal is to see significant learning progress relatively quickly (in terms of wall clock time). Experiments will likely take on the order of ~10 minutes.
-
- Use the ``--use_soln`` flag to run Spinning Up's TD3 instead of your implementation. Anecdotally, within 10 epochs, the score in HalfCheetah should go over 300, and the score in InvertedPendulum should max out at 150.
-
-.. _`page on TD3`: ../algorithms/td3.html
-
-
-Problem Set 2: Algorithm Failure Modes
---------------------------------------
-
-.. admonition:: Exercise 2.1: Value Function Fitting in TRPO
-
- **Path to Exercise.** (Not applicable, there is no code for this one.)
-
- **Path to Solution.** `Solution available here. <../spinningup/exercise2_1_soln.html>`_
-
- Many factors can impact the performance of policy gradient algorithms, but few more drastically than the quality of the learned value function used for advantage estimation.
-
- In this exercise, you will compare results between runs of TRPO where you put lots of effort into fitting the value function (``train_v_iters=80``), versus where you put very little effort into fitting the value function (``train_v_iters=0``).
-
- **Instructions.** Run the following command:
-
- .. parsed-literal::
-
- python -m spinup.run trpo --env Hopper-v2 --train_v_iters[v] 0 80 --exp_name ex2-1 --epochs 250 --steps_per_epoch 4000 --seed 0 10 20 --dt
-
- and plot the results. (These experiments might take ~10 minutes each, and this command runs six of them.) What do you find?
-
-.. admonition:: Exercise 2.2: Silent Bug in DDPG
-
- **Path to Exercise.** ``spinup/exercises/problem_set_2/exercise2_2.py``
-
- **Path to Solution.** `Solution available here. <../spinningup/exercise2_2_soln.html>`_
-
- The hardest part of writing RL code is dealing with bugs, because failures are frequently silent. The code will appear to run correctly, but the agent's performance will degrade relative to a bug-free implementation---sometimes to the extent that it never learns anything.
-
- In this exercise, you will observe a bug in vivo and compare results against correct code.
-
- **Instructions.** Run ``exercise2_2.py``, which will launch DDPG experiments with and without a bug. The non-bugged version runs the default Spinning Up implementation of DDPG, using a default method for creating the actor and critic networks. The bugged version runs the same DDPG code, except uses a bugged method for creating the networks.
-
- There will be six experiments in all (three random seeds for each case), and each should take in the ballpark of 10 minutes. When they're finished, plot the results. What is the difference in performance with and without the bug?
-
- Without referencing the correct actor-critic code (which is to say---don't look in DDPG's ``core.py`` file), try to figure out what the bug is and explain how it breaks things.
-
- **Hint.** To figure out what's going wrong, think about how the DDPG code implements the DDPG computation graph. Specifically, look at this excerpt:
-
- .. code-block:: python
-
- # Bellman backup for Q function
- backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*q_pi_targ)
-
- # DDPG losses
- pi_loss = -tf.reduce_mean(q_pi)
- q_loss = tf.reduce_mean((q-backup)**2)
-
- How could a bug in the actor-critic code have an impact here?
-
- **Bonus.** Are there any choices of hyperparameters which would have hidden the effects of the bug?
-
-
-Challenges
-----------
-
-.. admonition:: Write Code from Scratch
-
- As we suggest in `the essay <../spinningup/spinningup.html#learn-by-doing>`_, try reimplementing various deep RL algorithms from scratch.
-
-.. admonition:: Requests for Research
-
- If you feel comfortable with writing deep learning and deep RL code, consider trying to make progress on any of OpenAI's standing requests for research:
-
- * `Requests for Research 1 <https://openai.com/requests-for-research/>`_
- * `Requests for Research 2 <https://blog.openai.com/requests-for-research-2/>`_
\ No newline at end of file
diff --git a/docs/_build/html/_sources/spinningup/extra_pg_proof1.rst.txt b/docs/_build/html/_sources/spinningup/extra_pg_proof1.rst.txt
deleted file mode 100644
index 24f3f0355..000000000
--- a/docs/_build/html/_sources/spinningup/extra_pg_proof1.rst.txt
+++ /dev/null
@@ -1,117 +0,0 @@
-==============
-Extra Material
-==============
-
-Proof for Don't Let the Past Distract You
-=========================================
-
-In this subsection, we will prove that actions should not be reinforced for rewards obtained in the past.
-
-Expand out :math:`R(\tau)` in the expression for the `simplest policy gradient`_ to obtain:
-
-.. math::
-
- \nabla_{\theta} J(\pi_{\theta}) &= \underE{\tau \sim \pi_{\theta}}{\sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) R(\tau)} \\
- &= \underE{\tau \sim \pi_{\theta}}{\sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \sum_{t'=0}^T R(s_{t'}, a_{t'}, s_{t'+1})} \\
- &= \sum_{t=0}^{T} \sum_{t'=0}^T \underE{\tau \sim \pi_{\theta}}{\nabla_{\theta} \log \pi_{\theta}(a_t |s_t) R(s_{t'}, a_{t'}, s_{t'+1})},
-
-and consider the term
-
-.. math::
-
- \underE{\tau \sim \pi_{\theta}}{f(t,t')} = \underE{\tau \sim \pi_{\theta}}{\nabla_{\theta} \log \pi_{\theta}(a_t |s_t) R(s_{t'}, a_{t'}, s_{t'+1})}.
-
-We will show that for the case of :math:`t' < t` (the reward comes before the action being reinforced), this term is zero. This is a complete proof of the original claim, because after dropping terms with :math:`t' < t` from the expression, we are left with the reward-to-go form of the policy gradient, as desired:
-
-.. math::
-
- \nabla_{\theta} J(\pi_{\theta}) = \underE{\tau \sim \pi_{\theta}}{\sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \sum_{t'=t}^T R(s_{t'}, a_{t'}, s_{t'+1})}
-
-**1. Using the Marginal Distribution.** To proceed, we have to break down the expectation in :math:`\underE{\tau \sim \pi_{\theta}}{f(t,t')}`. It's an expectation over trajectories, but the expression inside the expectation only deals with a few states and actions: :math:`s_t`, :math:`a_t`, :math:`s_{t'}`, :math:`a_{t'}`, and :math:`s_{t'+1}`. So in computing the expectation, we only need to worry about the `marginal distribution`_ over these random variables.
-
-We derive:
-
-.. math::
-
- \underE{\tau \sim \pi_{\theta}}{f(t,t')} &= \int_{\tau} P(\tau|\pi_{\theta}) f(t,t') \\
- &= \int_{s_t, a_t, s_{t'}, a_{t'}, s_{t'+1}} P(s_t, a_t, s_{t'}, a_{t'}, s_{t'+1} | \pi_{\theta}) f(t,t') \\
- &= \underE{s_t, a_t, s_{t'}, a_{t'}, s_{t'+1} \sim \pi_{\theta}}{f(t,t')}.
-
-**2. Probability Chain Rule.** Joint distributions can be calculated in terms of conditional and marginal probabilities via `chain rule of probability`_: :math:`P(A,B) = P(B|A) P(A)`. Here, we use this rule to compute
-
-.. math::
-
- P(s_t, a_t, s_{t'}, a_{t'}, s_{t'+1} | \pi_{\theta}) = P(s_t, a_t | \pi_{\theta}, s_{t'}, a_{t'}, s_{t'+1}) P(s_{t'}, a_{t'}, s_{t'+1} | \pi_{\theta})
-
-
-**3. Separating Expectations Over Multiple Random Variables.** If we have an expectation over two random variables :math:`A` and :math:`B`, we can split it into an inner and outer expectation, where the inner expectation treats the variable from the outer expectation as a constant. Our ability to make this split relies on probability chain rule. Mathematically:
-
-.. math::
-
- \underE{A,B}{f(A,B)} &= \int_{A,B} P(A,B) f(A,B) \\
- &= \int_{A} \int_B P(B|A) P(A) f(A,B) \\
- &= \int_A P(A) \int_B P(B|A) f(A,B) \\
- &= \int_A P(A) \underE{B}{f(A,B) \Big| A} \\
- &= \underE{A}{\underE{B}{f(A,B) \Big| A} }
-
-An expectation over :math:`s_t, a_t, s_{t'}, a_{t'}, s_{t'+1}` can thus be expressed by
-
-.. math::
-
- \underE{\tau \sim \pi_{\theta}}{f(t,t')} &= \underE{s_t, a_t, s_{t'}, a_{t'}, s_{t'+1} \sim \pi_{\theta}}{f(t,t')} \\
- &= \underE{s_{t'}, a_{t'}, s_{t'+1} \sim \pi_{\theta}}{\underE{s_t, a_t \sim \pi_{\theta}}{f(t,t') \Big| s_{t'}, a_{t'}, s_{t'+1}}}
-
-**4. Constants Can Be Pulled Outside of Expectations.** If a term inside an expectation is constant with respect to the variable being expected over, it can be pulled outside of the expectation. To give an example, consider again an expectation over two random variables :math:`A` and :math:`B`, where this time, :math:`f(A,B) = h(A) g(B)`. Then, using the result from before:
-
-.. math::
-
- \underE{A,B}{f(A,B)} &= \underE{A}{\underE{B}{f(A,B) \Big| A}} \\
- &= \underE{A}{\underE{B}{h(A) g(B) \Big| A}}\\
- &= \underE{A}{h(A) \underE{B}{g(B) \Big| A}}.
-
-The function in our expectation decomposes this way, allowing us to write:
-
-.. math::
-
- \underE{\tau \sim \pi_{\theta}}{f(t,t')} &= \underE{s_{t'}, a_{t'}, s_{t'+1} \sim \pi_{\theta}}{\underE{s_t, a_t \sim \pi_{\theta}}{f(t,t') \Big| s_{t'}, a_{t'}, s_{t'+1}}} \\
- &= \underE{s_{t'}, a_{t'}, s_{t'+1} \sim \pi_{\theta}}{\underE{s_t, a_t \sim \pi_{\theta}}{\nabla_{\theta} \log \pi_{\theta}(a_t |s_t) R(s_{t'}, a_{t'}, s_{t'+1}) \Big| s_{t'}, a_{t'}, s_{t'+1}}} \\
- &= \underE{s_{t'}, a_{t'}, s_{t'+1} \sim \pi_{\theta}}{R(s_{t'}, a_{t'}, s_{t'+1}) \underE{s_t, a_t \sim \pi_{\theta}}{\nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \Big| s_{t'}, a_{t'}, s_{t'+1}}}.
-
-**5. Applying the EGLP Lemma.** The last step in our proof relies on the `EGLP lemma`_. At this point, we will only worry about the innermost expectation,
-
-.. math::
-
- \underE{s_t, a_t \sim \pi_{\theta}}{\nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \Big| s_{t'}, a_{t'}, s_{t'+1}} = \int_{s_t, a_t} P(s_t, a_t | \pi_{\theta}, s_{t'}, a_{t'}, s_{t'+1}) \nabla_{\theta} \log \pi_{\theta}(a_t |s_t).
-
-We now have to make a distinction between two cases: :math:`t' < t`, the case where the reward happened before the action, and :math:`t' \geq t`, where it didn't.
-
-**Case One: Reward Before Action.** If :math:`t' < t`, then the conditional probabilities for actions at :math:`a_t` come from the policy:
-
-.. math::
-
- P(s_t, a_t | \pi_{\theta}, s_{t'}, a_{t'}, s_{t'+1}) &= \pi_{\theta}(a_t | s_t) P(s_t | \pi_{\theta}, s_{t'}, a_{t'}, s_{t'+1}),
-
-the innermost expectation can be broken down farther into
-
-.. math::
-
- \underE{s_t, a_t \sim \pi_{\theta}}{\nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \Big| s_{t'}, a_{t'}, s_{t'+1}} &= \int_{s_t, a_t} P(s_t, a_t | \pi_{\theta}, s_{t'}, a_{t'}, s_{t'+1}) \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \\
- &= \int_{s_t} P(s_t | \pi_{\theta}, s_{t'}, a_{t'}, s_{t'+1}) \int_{a_t} \pi_{\theta}(a_t | s_t) \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \\
- &= \underE{s_t \sim \pi_{\theta}}{ \underE{a_t \sim \pi_{\theta}}{\nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \Big| s_t } \Big| s_{t'}, a_{t'}, s_{t'+1}}.
-
-The EGLP lemma says that
-
-.. math::
-
- \underE{a_t \sim \pi_{\theta}}{\nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \Big| s_t } = 0,
-
-allowing us to conclude that for :math:`t' < t`, :math:`\underE{\tau \sim \pi_{\theta}}{f(t,t')} = 0`.
-
-**Case Two: Reward After Action.** What about the :math:`t' \geq t` case, though? Why doesn't the same logic apply? In this case, the conditional probabilities for :math:`a_t` can't be broken down the same way, because you're conditioning **on the future.** Think about it like this: let's say that every day, in the morning, you make a choice between going for a jog and going to work early, and you have a 50-50 chance of each option. If you condition on a future where you went to work early, what are the odds that you went for a jog? Clearly, you didn't. But if you're conditioning on the past---before you made the decision---what are the odds that you will later go for a jog? Now it's back to 50-50.
-
-So in the case where :math:`t' \geq t`, the conditional distribution over actions :math:`a_t` is **not** :math:`\pi(a_t|s_t)`, and the EGLP lemma does not apply.
-
-.. _`simplest policy gradient`: ../spinningup/rl_intro3.html#deriving-the-simplest-policy-gradient
-.. _`marginal distribution`: https://en.wikipedia.org/wiki/Marginal_distribution
-.. _`chain rule of probability`: https://en.wikipedia.org/wiki/Chain_rule_(probability)
-.. _`EGLP lemma`: ../spinningup/rl_intro3.html#expected-grad-log-prob-lemma
\ No newline at end of file
diff --git a/docs/_build/html/_sources/spinningup/extra_pg_proof2.rst.txt b/docs/_build/html/_sources/spinningup/extra_pg_proof2.rst.txt
deleted file mode 100644
index defd3076f..000000000
--- a/docs/_build/html/_sources/spinningup/extra_pg_proof2.rst.txt
+++ /dev/null
@@ -1,46 +0,0 @@
-==============
-Extra Material
-==============
-
-Proof for Using Q-Function in Policy Gradient Formula
-=====================================================
-
-In this section, we will show that
-
-.. math::
-
- \nabla_{\theta} J(\pi_{\theta}) &= \underE{\tau \sim \pi_{\theta}}{\sum_{t=0}^{T} \Big( \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \Big) Q^{\pi_{\theta}}(s_t, a_t)},
-
-for the finite-horizon undiscounted return setting. (An analogous result holds in the infinite-horizon discounted case using basically the same proof.)
-
-
-The proof of this claim depends on the `law of iterated expectations`_. First, let's rewrite the expression for the policy gradient, starting from the reward-to-go form (using the notation :math:`\hat{R}_t = \sum_{t'=t}^T R(s_t, a_t, s_{t+1})` to help shorten things):
-
-.. math::
-
- \nabla_{\theta} J(\pi_{\theta}) &= \underE{\tau \sim \pi_{\theta}}{\sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \hat{R}_t} \\
- &= \sum_{t=0}^{T} \underE{\tau \sim \pi_{\theta}}{\nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \hat{R}_t}
-
-Define :math:`\tau_{:t} = (s_0, a_0, ..., s_t, a_t)` as the trajectory up to time :math:`t`, and :math:`\tau_{t:}` as the remainder of the trajectory after that. By the law of iterated expectations, we can break up the preceding expression into:
-
-.. math::
-
- \nabla_{\theta} J(\pi_{\theta}) &= \sum_{t=0}^{T} \underE{\tau_{:t} \sim \pi_{\theta}}{ \underE{\tau_{t:} \sim \pi_{\theta}}{ \left. \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \hat{R}_t \right| \tau_{:t}}}
-
-The grad-log-prob is constant with respect to the inner expectation (because it depends on :math:`s_t` and :math:`a_t`, which the inner expectation conditions on as fixed in :math:`\tau_{:t}`), so it can be pulled out, leaving:
-
-.. math::
-
- \nabla_{\theta} J(\pi_{\theta}) &= \sum_{t=0}^{T} \underE{\tau_{:t} \sim \pi_{\theta}}{ \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \underE{\tau_{t:} \sim \pi_{\theta}}{ \left. \hat{R}_t \right| \tau_{:t}}}
-
-In Markov Decision Processes, the future only depends on the most recent state and action. As a result, the inner expectation---which expects over the future, conditioned on the entirety of the past (everything up to time :math:`t`)---is equal to the same expectation if it only conditioned on the last timestep (just :math:`(s_t,a_t)`):
-
-.. math::
-
- \underE{\tau_{t:} \sim \pi_{\theta}}{ \left. \hat{R}_t \right| \tau_{:t}} = \underE{\tau_{t:} \sim \pi_{\theta}}{ \left. \hat{R}_t \right| s_t, a_t},
-
-which is the *definition* of :math:`Q^{\pi_{\theta}}(s_t, a_t)`: the expected return, starting from state :math:`s_t` and action :math:`a_t`, when acting on-policy for the rest of the trajectory.
-
-The result follows immediately.
-
-.. _`law of iterated expectations`: https://en.wikipedia.org/wiki/Law_of_total_expectation
diff --git a/docs/_build/html/_sources/spinningup/keypapers.rst.txt b/docs/_build/html/_sources/spinningup/keypapers.rst.txt
deleted file mode 100644
index d3dcc8fcf..000000000
--- a/docs/_build/html/_sources/spinningup/keypapers.rst.txt
+++ /dev/null
@@ -1,308 +0,0 @@
-=====================
-Key Papers in Deep RL
-=====================
-
-What follows is a list of papers in deep RL that are worth reading. This is *far* from comprehensive, but should provide a useful starting point for someone looking to do research in the field.
-
-.. contents:: Table of Contents
- :depth: 2
-
-
-1. Model-Free RL
-================
-
-a. Deep Q-Learning
-------------------
-
-
-.. [#] `Playing Atari with Deep Reinforcement Learning `_, Mnih et al, 2013. **Algorithm: DQN.**
-
-.. [#] `Deep Recurrent Q-Learning for Partially Observable MDPs `_, Hausknecht and Stone, 2015. **Algorithm: Deep Recurrent Q-Learning.**
-
-.. [#] `Dueling Network Architectures for Deep Reinforcement Learning `_, Wang et al, 2015. **Algorithm: Dueling DQN.**
-
-.. [#] `Deep Reinforcement Learning with Double Q-learning `_, Hasselt et al 2015. **Algorithm: Double DQN.**
-
-.. [#] `Prioritized Experience Replay `_, Schaul et al, 2015. **Algorithm: Prioritized Experience Replay (PER).**
-
-.. [#] `Rainbow: Combining Improvements in Deep Reinforcement Learning `_, Hessel et al, 2017. **Algorithm: Rainbow DQN.**
-
-
-b. Policy Gradients
--------------------
-
-
-.. [#] `Asynchronous Methods for Deep Reinforcement Learning `_, Mnih et al, 2016. **Algorithm: A3C.**
-
-.. [#] `Trust Region Policy Optimization `_, Schulman et al, 2015. **Algorithm: TRPO.**
-
-.. [#] `High-Dimensional Continuous Control Using Generalized Advantage Estimation `_, Schulman et al, 2015. **Algorithm: GAE.**
-
-.. [#] `Proximal Policy Optimization Algorithms `_, Schulman et al, 2017. **Algorithm: PPO-Clip, PPO-Penalty.**
-
-.. [#] `Emergence of Locomotion Behaviours in Rich Environments `_, Heess et al, 2017. **Algorithm: PPO-Penalty.**
-
-.. [#] `Scalable trust-region method for deep reinforcement learning using Kronecker-factored approximation `_, Wu et al, 2017. **Algorithm: ACKTR.**
-
-.. [#] `Sample Efficient Actor-Critic with Experience Replay `_, Wang et al, 2016. **Algorithm: ACER.**
-
-.. [#] `Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor `_, Haarnoja et al, 2018. **Algorithm: SAC.**
-
-c. Deterministic Policy Gradients
----------------------------------
-
-
-.. [#] `Deterministic Policy Gradient Algorithms `_, Silver et al, 2014. **Algorithm: DPG.**
-
-.. [#] `Continuous Control With Deep Reinforcement Learning `_, Lillicrap et al, 2015. **Algorithm: DDPG.**
-
-.. [#] `Addressing Function Approximation Error in Actor-Critic Methods `_, Fujimoto et al, 2018. **Algorithm: TD3.**
-
-
-d. Distributional RL
---------------------
-
-.. [#] `A Distributional Perspective on Reinforcement Learning `_, Bellemare et al, 2017. **Algorithm: C51.**
-
-.. [#] `Distributional Reinforcement Learning with Quantile Regression `_, Dabney et al, 2017. **Algorithm: QR-DQN.**
-
-.. [#] `Implicit Quantile Networks for Distributional Reinforcement Learning `_, Dabney et al, 2018. **Algorithm: IQN.**
-
-.. [#] `Dopamine: A Research Framework for Deep Reinforcement Learning `_, Anonymous, 2018. **Contribution:** Introduces Dopamine, a code repository containing implementations of DQN, C51, IQN, and Rainbow. `Code link. `_
-
-e. Policy Gradients with Action-Dependent Baselines
----------------------------------------------------
-
-.. [#] `Q-Prop: Sample-Efficient Policy Gradient with An Off-Policy Critic `_, Gu et al, 2016. **Algorithm: Q-Prop.**
-
-.. [#] `Action-depedent Control Variates for Policy Optimization via Stein's Identity `_, Liu et al, 2017. **Algorithm: Stein Control Variates.**
-
-.. [#] `The Mirage of Action-Dependent Baselines in Reinforcement Learning `_, Tucker et al, 2018. **Contribution:** interestingly, critiques and reevaluates claims from earlier papers (including Q-Prop and stein control variates) and finds important methodological errors in them.
-
-
-f. Path-Consistency Learning
-----------------------------
-
-.. [#] `Bridging the Gap Between Value and Policy Based Reinforcement Learning `_, Nachum et al, 2017. **Algorithm: PCL.**
-
-.. [#] `Trust-PCL: An Off-Policy Trust Region Method for Continuous Control `_, Nachum et al, 2017. **Algorithm: Trust-PCL.**
-
-g. Other Directions for Combining Policy-Learning and Q-Learning
-----------------------------------------------------------------
-
-.. [#] `Combining Policy Gradient and Q-learning `_, O'Donoghue et al, 2016. **Algorithm: PGQL.**
-
-.. [#] `The Reactor: A Fast and Sample-Efficient Actor-Critic Agent for Reinforcement Learning `_, Gruslys et al, 2017. **Algorithm: Reactor.**
-
-.. [#] `Interpolated Policy Gradient: Merging On-Policy and Off-Policy Gradient Estimation for Deep Reinforcement Learning `_, Gu et al, 2017. **Algorithm: IPG.**
-
-.. [#] `Equivalence Between Policy Gradients and Soft Q-Learning `_, Schulman et al, 2017. **Contribution:** Reveals a theoretical link between these two families of RL algorithms.
-
-
-h. Evolutionary Algorithms
---------------------------
-
-.. [#] `Evolution Strategies as a Scalable Alternative to Reinforcement Learning `_, Salimans et al, 2017. **Algorithm: ES.**
-
-
-
-2. Exploration
-==============
-
-a. Intrinsic Motivation
------------------------
-
-.. [#] `VIME: Variational Information Maximizing Exploration `_, Houthooft et al, 2016. **Algorithm: VIME.**
-
-.. [#] `Unifying Count-Based Exploration and Intrinsic Motivation `_, Bellemare et al, 2016. **Algorithm: CTS-based Pseudocounts.**
-
-.. [#] `Count-Based Exploration with Neural Density Models `_, Ostrovski et al, 2017. **Algorithm: PixelCNN-based Pseudocounts.**
-
-.. [#] `#Exploration: A Study of Count-Based Exploration for Deep Reinforcement Learning `_, Tang et al, 2016. **Algorithm: Hash-based Counts.**
-
-.. [#] `EX2: Exploration with Exemplar Models for Deep Reinforcement Learning `_, Fu et al, 2017. **Algorithm: EX2.**
-
-.. [#] `Curiosity-driven Exploration by Self-supervised Prediction `_, Pathak et al, 2017. **Algorithm: Intrinsic Curiosity Module (ICM).**
-
-.. [#] `Large-Scale Study of Curiosity-Driven Learning `_, Burda et al, 2018. **Contribution:** Systematic analysis of how surprisal-based intrinsic motivation performs in a wide variety of environments.
-
-.. [#] `Exploration by Random Network Distillation `_, Burda et al, 2018. **Algorithm: RND.**
-
-
-b. Unsupervised RL
-------------------
-
-.. [#] `Variational Intrinsic Control `_, Gregor et al, 2016. **Algorithm: VIC.**
-
-.. [#] `Diversity is All You Need: Learning Skills without a Reward Function `_, Eysenbach et al, 2018. **Algorithm: DIAYN.**
-
-.. [#] `Variational Option Discovery Algorithms `_, Achiam et al, 2018. **Algorithm: VALOR.**
-
-
-3. Transfer and Multitask RL
-============================
-
-.. [#] `Progressive Neural Networks `_, Rusu et al, 2016. **Algorithm: Progressive Networks.**
-
-.. [#] `Universal Value Function Approximators `_, Schaul et al, 2015. **Algorithm: UVFA.**
-
-.. [#] `Reinforcement Learning with Unsupervised Auxiliary Tasks `_, Jaderberg et al, 2016. **Algorithm: UNREAL.**
-
-.. [#] `The Intentional Unintentional Agent: Learning to Solve Many Continuous Control Tasks Simultaneously `_, Cabi et al, 2017. **Algorithm: IU Agent.**
-
-.. [#] `PathNet: Evolution Channels Gradient Descent in Super Neural Networks `_, Fernando et al, 2017. **Algorithm: PathNet.**
-
-.. [#] `Mutual Alignment Transfer Learning `_, Wulfmeier et al, 2017. **Algorithm: MATL.**
-
-.. [#] `Learning an Embedding Space for Transferable Robot Skills `_, Hausman et al, 2018.
-
-.. [#] `Hindsight Experience Replay `_, Andrychowicz et al, 2017. **Algorithm: Hindsight Experience Replay (HER).**
-
-4. Hierarchy
-============
-
-.. [#] `Strategic Attentive Writer for Learning Macro-Actions `_, Vezhnevets et al, 2016. **Algorithm: STRAW.**
-
-.. [#] `FeUdal Networks for Hierarchical Reinforcement Learning `_, Vezhnevets et al, 2017. **Algorithm: Feudal Networks.**
-
-.. [#] `Data-Efficient Hierarchical Reinforcement Learning `_, Nachum et al, 2018. **Algorithm: HIRO.**
-
-5. Memory
-=========
-
-.. [#] `Model-Free Episodic Control `_, Blundell et al, 2016. **Algorithm: MFEC.**
-
-
-.. [#] `Neural Episodic Control `_, Pritzel et al, 2017. **Algorithm: NEC.**
-
-.. [#] `Neural Map: Structured Memory for Deep Reinforcement Learning `_, Parisotto and Salakhutdinov, 2017. **Algorithm: Neural Map.**
-
-.. [#] `Unsupervised Predictive Memory in a Goal-Directed Agent `_, Wayne et al, 2018. **Algorithm: MERLIN.**
-
-.. [#] `Relational Recurrent Neural Networks `_, Santoro et al, 2018. **Algorithm: RMC.**
-
-6. Model-Based RL
-=================
-
-a. Model is Learned
--------------------
-
-.. [#] `Imagination-Augmented Agents for Deep Reinforcement Learning `_, Weber et al, 2017. **Algorithm: I2A.**
-
-.. [#] `Neural Network Dynamics for Model-Based Deep Reinforcement Learning with Model-Free Fine-Tuning `_, Nagabandi et al, 2017. **Algorithm: MBMF.**
-
-.. [#] `Model-Based Value Expansion for Efficient Model-Free Reinforcement Learning `_, Feinberg et al, 2018. **Algorithm: MVE.**
-
-.. [#] `Sample-Efficient Reinforcement Learning with Stochastic Ensemble Value Expansion `_, Buckman et al, 2018. **Algorithm: STEVE.**
-
-.. [#] `Model-Ensemble Trust-Region Policy Optimization `_, Kurutach et al, 2018. **Algorithm: ME-TRPO.**
-
-.. [#] `Model-Based Reinforcement Learning via Meta-Policy Optimization `_, Clavera et al, 2018. **Algorithm: MB-MPO.**
-
-.. [#] `Recurrent World Models Facilitate Policy Evolution `_, Ha and Schmidhuber, 2018.
-
-b. Model is Given
------------------
-
-.. [#] `Mastering Chess and Shogi by Self-Play with a General Reinforcement Learning Algorithm `_, Silver et al, 2017. **Algorithm: AlphaZero.**
-
-.. [#] `Thinking Fast and Slow with Deep Learning and Tree Search `_, Anthony et al, 2017. **Algorithm: ExIt.**
-
-7. Meta-RL
-==========
-
-.. [#] `RL^2: Fast Reinforcement Learning via Slow Reinforcement Learning `_, Duan et al, 2016. **Algorithm: RL^2.**
-
-.. [#] `Learning to Reinforcement Learn `_, Wang et al, 2016.
-
-.. [#] `Model-Agnostic Meta-Learning for Fast Adaptation of Deep Networks `_, Finn et al, 2017. **Algorithm: MAML.**
-
-.. [#] `A Simple Neural Attentive Meta-Learner `_, Mishra et al, 2018. **Algorithm: SNAIL.**
-
-8. Scaling RL
-=============
-
-.. [#] `Accelerated Methods for Deep Reinforcement Learning `_, Stooke and Abbeel, 2018. **Contribution:** Systematic analysis of parallelization in deep RL across algorithms.
-
-.. [#] `IMPALA: Scalable Distributed Deep-RL with Importance Weighted Actor-Learner Architectures `_, Espeholt et al, 2018. **Algorithm: IMPALA.**
-
-.. [#] `Distributed Prioritized Experience Replay `_, Horgan et al, 2018. **Algorithm: Ape-X.**
-
-.. [#] `Recurrent Experience Replay in Distributed Reinforcement Learning `_, Anonymous, 2018. **Algorithm: R2D2.**
-
-.. [#] `RLlib: Abstractions for Distributed Reinforcement Learning `_, Liang et al, 2017. **Contribution:** A scalable library of RL algorithm implementations. `Documentation link. `_
-
-
-9. RL in the Real World
-=======================
-
-.. [#] `Benchmarking Reinforcement Learning Algorithms on Real-World Robots `_, Mahmood et al, 2018.
-
-.. [#] `Learning Dexterous In-Hand Manipulation `_, OpenAI, 2018.
-
-.. [#] `QT-Opt: Scalable Deep Reinforcement Learning for Vision-Based Robotic Manipulation `_, Kalashnikov et al, 2018. **Algorithm: QT-Opt.**
-
-.. [#] `Horizon: Facebook's Open Source Applied Reinforcement Learning Platform `_, Gauci et al, 2018.
-
-
-10. Safety
-==========
-
-.. [#] `Concrete Problems in AI Safety `_, Amodei et al, 2016. **Contribution:** establishes a taxonomy of safety problems, serving as an important jumping-off point for future research. We need to solve these!
-
-.. [#] `Deep Reinforcement Learning From Human Preferences `_, Christiano et al, 2017. **Algorithm: LFP.**
-
-.. [#] `Constrained Policy Optimization `_, Achiam et al, 2017. **Algorithm: CPO.**
-
-.. [#] `Safe Exploration in Continuous Action Spaces `_, Dalal et al, 2018. **Algorithm: DDPG+Safety Layer.**
-
-.. [#] `Trial without Error: Towards Safe Reinforcement Learning via Human Intervention `_, Saunders et al, 2017. **Algorithm: HIRL.**
-
-.. [#] `Leave No Trace: Learning to Reset for Safe and Autonomous Reinforcement Learning `_, Eysenbach et al, 2017. **Algorithm: Leave No Trace.**
-
-
-11. Imitation Learning and Inverse Reinforcement Learning
-=========================================================
-
-.. [#] `Modeling Purposeful Adaptive Behavior with the Principle of Maximum Causal Entropy `_, Ziebart 2010. **Contributions:** Crisp formulation of maximum entropy IRL.
-
-.. [#] `Guided Cost Learning: Deep Inverse Optimal Control via Policy Optimization `_, Finn et al, 2016. **Algorithm: GCL.**
-
-.. [#] `Generative Adversarial Imitation Learning `_, Ho and Ermon, 2016. **Algorithm: GAIL.**
-
-.. [#] `DeepMimic: Example-Guided Deep Reinforcement Learning of Physics-Based Character Skills `_, Peng et al, 2018. **Algorithm: DeepMimic.**
-
-.. [#] `Variational Discriminator Bottleneck: Improving Imitation Learning, Inverse RL, and GANs by Constraining Information Flow `_, Peng et al, 2018. **Algorithm: VAIL.**
-
-.. [#] `One-Shot High-Fidelity Imitation: Training Large-Scale Deep Nets with RL `_, Le Paine et al, 2018. **Algorithm: MetaMimic.**
-
-
-12. Reproducibility, Analysis, and Critique
-===========================================
-
-.. [#] `Benchmarking Deep Reinforcement Learning for Continuous Control `_, Duan et al, 2016. **Contribution: rllab.**
-
-.. [#] `Reproducibility of Benchmarked Deep Reinforcement Learning Tasks for Continuous Control `_, Islam et al, 2017.
-
-.. [#] `Deep Reinforcement Learning that Matters `_, Henderson et al, 2017.
-
-.. [#] `Where Did My Optimum Go?: An Empirical Analysis of Gradient Descent Optimization in Policy Gradient Methods `_, Henderson et al, 2018.
-
-.. [#] `Are Deep Policy Gradient Algorithms Truly Policy Gradient Algorithms? `_, Ilyas et al, 2018.
-
-.. [#] `Simple Random Search Provides a Competitive Approach to Reinforcement Learning `_, Mania et al, 2018.
-
-13. Bonus: Classic Papers in RL Theory or Review
-================================================
-
-.. [#] `Policy Gradient Methods for Reinforcement Learning with Function Approximation `_, Sutton et al, 2000. **Contributions:** Established policy gradient theorem and showed convergence of policy gradient algorithm for arbitrary policy classes.
-
-.. [#] `An Analysis of Temporal-Difference Learning with Function Approximation `_, Tsitsiklis and Van Roy, 1997. **Contributions:** Variety of convergence results and counter-examples for value-learning methods in RL.
-
-.. [#] `Reinforcement Learning of Motor Skills with Policy Gradients `_, Peters and Schaal, 2008. **Contributions:** Thorough review of policy gradient methods at the time, many of which are still serviceable descriptions of deep RL methods.
-
-.. [#] `Approximately Optimal Approximate Reinforcement Learning `_, Kakade and Langford, 2002. **Contributions:** Early roots for monotonic improvement theory, later leading to theoretical justification for TRPO and other algorithms.
-
-.. [#] `A Natural Policy Gradient `_, Kakade, 2002. **Contributions:** Brought natural gradients into RL, later leading to TRPO, ACKTR, and several other methods in deep RL.
-
-.. [#] `Algorithms for Reinforcement Learning `_, Szepesvari, 2009. **Contributions:** Unbeatable reference on RL before deep RL, containing foundations and theoretical background.
diff --git a/docs/_build/html/_sources/spinningup/rl_intro.rst.txt b/docs/_build/html/_sources/spinningup/rl_intro.rst.txt
deleted file mode 100644
index 2c159ca09..000000000
--- a/docs/_build/html/_sources/spinningup/rl_intro.rst.txt
+++ /dev/null
@@ -1,434 +0,0 @@
-==========================
-Part 1: Key Concepts in RL
-==========================
-
-
-.. contents:: Table of Contents
- :depth: 2
-
-Welcome to our introduction to reinforcement learning! Here, we aim to acquaint you with
-
-* the language and notation used to discuss the subject,
-* a high-level explanation of what RL algorithms do (although we mostly avoid the question of *how* they do it),
-* and a little bit of the core math that underlies the algorithms.
-
-In a nutshell, RL is the study of agents and how they learn by trial and error. It formalizes the idea that rewarding or punishing an agent for its behavior makes it more likely to repeat or forego that behavior in the future.
-
-
-What Can RL Do?
-===============
-
-RL methods have recently enjoyed a wide variety of successes. For example, it's been used to teach computers to control robots in simulation...
-
-.. raw:: html
-
-
-
-...and in the real world...
-
-.. raw:: html
-
-
-
-
-
-
-
-It's also famously been used to create breakthrough AIs for sophisticated strategy games, most notably `Go`_ and `Dota`_, taught computers to `play Atari games`_ from raw pixels, and trained simulated robots `to follow human instructions`_.
-
-.. _`Go`: https://deepmind.com/research/alphago/
-.. _`Dota`: https://blog.openai.com/openai-five/
-.. _`play Atari games`: https://deepmind.com/research/dqn/
-.. _`to follow human instructions`: https://blog.openai.com/deep-reinforcement-learning-from-human-preferences/
-
-
-Key Concepts and Terminology
-============================
-
-.. figure:: ../images/rl_diagram_transparent_bg.png
- :align: center
-
- Agent-environment interaction loop.
-
-The main characters of RL are the **agent** and the **environment**. The environment is the world that the agent lives in and interacts with. At every step of interaction, the agent sees a (possibly partial) observation of the state of the world, and then decides on an action to take. The environment changes when the agent acts on it, but may also change on its own.
-
-The agent also perceives a **reward** signal from the environment, a number that tells it how good or bad the current world state is. The goal of the agent is to maximize its cumulative reward, called **return**. Reinforcement learning methods are ways that the agent can learn behaviors to achieve its goal.
-
-To talk more specifically about what RL does, we need to introduce additional terminology. We need to talk about
-
-* states and observations,
-* action spaces,
-* policies,
-* trajectories,
-* different formulations of return,
-* the RL optimization problem,
-* and value functions.
-
-
-States and Observations
------------------------
-
-A **state** :math:`s` is a complete description of the state of the world. There is no information about the world which is hidden from the state. An **observation** :math:`o` is a partial description of a state, which may omit information.
-
-In deep RL, we almost always represent states and observations by a `real-valued vector, matrix, or higher-order tensor`_. For instance, a visual observation could be represented by the RGB matrix of its pixel values; the state of a robot might be represented by its joint angles and velocities.
-
-When the agent is able to observe the complete state of the environment, we say that the environment is **fully observed**. When the agent can only see a partial observation, we say that the environment is **partially observed**.
-
-.. admonition:: You Should Know
-
- Reinforcement learning notation sometimes puts the symbol for state, :math:`s`, in places where it would be technically more appropriate to write the symbol for observation, :math:`o`. Specifically, this happens when talking about how the agent decides an action: we often signal in notation that the action is conditioned on the state, when in practice, the action is conditioned on the observation because the agent does not have access to the state.
-
- In our guide, we'll follow standard conventions for notation, but it should be clear from context which is meant. If something is unclear, though, please raise an issue! Our goal is to teach, not to confuse.
-
-.. _`real-valued vector, matrix, or higher-order tensor`: https://en.wikipedia.org/wiki/Real_coordinate_space
-
-
-Action Spaces
--------------
-
-Different environments allow different kinds of actions. The set of all valid actions in a given environment is often called the **action space**. Some environments, like Atari and Go, have **discrete action spaces**, where only a finite number of moves are available to the agent. Other environments, like where the agent controls a robot in a physical world, have **continuous action spaces**. In continuous spaces, actions are real-valued vectors.
-
-This distinction has some quite-profound consequences for methods in deep RL. Some families of algorithms can only be directly applied in one case, and would have to be substantially reworked for the other.
-
-
-Policies
---------
-
-A **policy** is a rule used by an agent to decide what actions to take. It can be deterministic, in which case it is usually denoted by :math:`\mu`:
-
-.. math::
-
- a_t = \mu(s_t),
-
-or it may be stochastic, in which case it is usually denoted by :math:`\pi`:
-
-.. math::
-
- a_t \sim \pi(\cdot | s_t).
-
-Because the policy is essentially the agent's brain, it's not uncommon to substitute the word "policy" for "agent", eg saying "The policy is trying to maximize reward."
-
-In deep RL, we deal with **parameterized policies**: policies whose outputs are computable functions that depend on a set of parameters (eg the weights and biases of a neural network) which we can adjust to change the behavior via some optimization algorithm.
-
-We often denote the parameters of such a policy by :math:`\theta` or :math:`\phi`, and then write this as a subscript on the policy symbol to highlight the connection:
-
-.. math::
-
- a_t &= \mu_{\theta}(s_t) \\
- a_t &\sim \pi_{\theta}(\cdot | s_t).
-
-
-Deterministic Policies
-^^^^^^^^^^^^^^^^^^^^^^
-
-**Example: Deterministic Policies.** Here is a code snippet for building a simple deterministic policy for a continuous action space in Tensorflow:
-
-.. code-block:: python
-
- obs = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32)
- net = mlp(obs, hidden_dims=(64,64), activation=tf.tanh)
- actions = tf.layers.dense(net, units=act_dim, activation=None)
-
-where ``mlp`` is a function that stacks multiple ``dense`` layers on top of each other with the given sizes and activation.
-
-Stochastic Policies
-^^^^^^^^^^^^^^^^^^^
-
-The two most common kinds of stochastic policies in deep RL are **categorical policies** and **diagonal Gaussian policies**.
-
-`Categorical`_ policies can be used in discrete action spaces, while diagonal `Gaussian`_ policies are used in continuous action spaces.
-
-Two key computations are centrally important for using and training stochastic policies:
-
-* sampling actions from the policy,
-* and computing log likelihoods of particular actions, :math:`\log \pi_{\theta}(a|s)`.
-
-In what follows, we'll describe how to do these for both categorical and diagonal Gaussian policies.
-
-.. admonition:: Categorical Policies
-
- A categorical policy is like a classifier over discrete actions. You build the neural network for a categorical policy the same way you would for a classifier: the input is the observation, followed by some number of layers (possibly convolutional or densely-connected, depending on the kind of input), and then you have one final linear layer that gives you logits for each action, followed by a `softmax`_ to convert the logits into probabilities.
-
- **Sampling.** Given the probabilities for each action, frameworks like Tensorflow have built-in tools for sampling. For example, see the `tf.distributions.Categorical`_ documentation, or `tf.multinomial`_.
-
- **Log-Likelihood.** Denote the last layer of probabilities as :math:`P_{\theta}(s)`. It is a vector with however many entries as there are actions, so we can treat the actions as indices for the vector. The log likelihood for an action :math:`a` can then be obtained by indexing into the vector:
-
- .. math::
-
- \log \pi_{\theta}(a|s) = \log \left[P_{\theta}(s)\right]_a.
-
-
-.. admonition:: Diagonal Gaussian Policies
-
- A multivariate Gaussian distribution (or multivariate normal distribution, if you prefer) is described by a mean vector, :math:`\mu`, and a covariance matrix, :math:`\Sigma`. A diagonal Gaussian distribution is a special case where the covariance matrix only has entries on the diagonal. As a result, we can represent it by a vector.
-
- A diagonal Gaussian policy always has a neural network that maps from observations to mean actions, :math:`\mu_{\theta}(s)`. There are two different ways that the covariance matrix is typically represented.
-
- **The first way:** There is a single vector of log standard deviations, :math:`\log \sigma`, which is **not** a function of state: the :math:`\log \sigma` are standalone parameters. (You Should Know: our implementations of VPG, TRPO, and PPO do it this way.)
-
- **The second way:** There is a neural network that maps from states to log standard deviations, :math:`\log \sigma_{\theta}(s)`. It may optionally share some layers with the mean network.
-
- Note that in both cases we output log standard deviations instead of standard deviations directly. This is because log stds are free to take on any values in :math:`(-\infty, \infty)`, while stds must be nonnegative. It's easier to train parameters if you don't have to enforce those kinds of constraints. The standard deviations can be obtained immediately from the log standard deviations by exponentiating them, so we do not lose anything by representing them this way.
-
- **Sampling.** Given the mean action :math:`\mu_{\theta}(s)` and standard deviation :math:`\sigma_{\theta}(s)`, and a vector :math:`z` of noise from a spherical Gaussian (:math:`z \sim \mathcal{N}(0, I)`), an action sample can be computed with
-
- .. math::
-
- a = \mu_{\theta}(s) + \sigma_{\theta}(s) \odot z,
-
- where :math:`\odot` denotes the elementwise product of two vectors. Standard frameworks have built-in ways to compute the noise vectors, such as `tf.random_normal`_. Alternatively, you can just provide the mean and standard deviation directly to a `tf.distributions.Normal`_ object and use that to sample.
-
- **Log-Likelihood.** The log-likelihood of a :math:`k` -dimensional action :math:`a`, for a diagonal Gaussian with mean :math:`\mu = \mu_{\theta}(s)` and standard deviation :math:`\sigma = \sigma_{\theta}(s)`, is given by
-
- .. math::
-
- \log \pi_{\theta}(a|s) = -\frac{1}{2}\left(\sum_{i=1}^k \left(\frac{(a_i - \mu_i)^2}{\sigma_i^2} + 2 \log \sigma_i \right) + k \log 2\pi \right).
-
-
-
-.. _`Categorical`: https://en.wikipedia.org/wiki/Categorical_distribution
-.. _`Gaussian`: https://en.wikipedia.org/wiki/Multivariate_normal_distribution
-.. _`softmax`: https://developers.google.com/machine-learning/crash-course/multi-class-neural-networks/softmax
-.. _`tf.distributions.Categorical`: https://www.tensorflow.org/api_docs/python/tf/distributions/Categorical
-.. _`tf.multinomial`: https://www.tensorflow.org/api_docs/python/tf/multinomial
-.. _`tf.random_normal`: https://www.tensorflow.org/api_docs/python/tf/random_normal
-.. _`tf.distributions.Normal`: https://www.tensorflow.org/api_docs/python/tf/distributions/Normal
-
-Trajectories
-------------
-
-A trajectory :math:`\tau` is a sequence of states and actions in the world,
-
-.. math::
-
- \tau = (s_0, a_0, s_1, a_1, ...).
-
-The very first state of the world, :math:`s_0`, is randomly sampled from the **start-state distribution**, sometimes denoted by :math:`\rho_0`:
-
-.. math::
-
- s_0 \sim \rho_0(\cdot).
-
-State transitions (what happens to the world between the state at time :math:`t`, :math:`s_t`, and the state at :math:`t+1`, :math:`s_{t+1}`), are governed by the natural laws of the environment, and depend on only the most recent action, :math:`a_t`. They can be either deterministic,
-
-.. math::
-
- s_{t+1} = f(s_t, a_t)
-
-or stochastic,
-
-.. math::
-
- s_{t+1} \sim P(\cdot|s_t, a_t).
-
-Actions come from an agent according to its policy.
-
-.. admonition:: You Should Know
-
- Trajectories are also frequently called **episodes** or **rollouts**.
-
-
-Reward and Return
------------------
-
-The reward function :math:`R` is critically important in reinforcement learning. It depends on the current state of the world, the action just taken, and the next state of the world:
-
-.. math::
-
- r_t = R(s_t, a_t, s_{t+1})
-
-although frequently this is simplified to just a dependence on the current state, :math:`r_t = R(s_t)`, or state-action pair :math:`r_t = R(s_t,a_t)`.
-
-The goal of the agent is to maximize some notion of cumulative reward over a trajectory, but this actually can mean a few things. We'll notate all of these cases with :math:`R(\tau)`, and it will either be clear from context which case we mean, or it won't matter (because the same equations will apply to all cases).
-
-One kind of return is the **finite-horizon undiscounted return**, which is just the sum of rewards obtained in a fixed window of steps:
-
-.. math::
-
- R(\tau) = \sum_{t=0}^T r_t.
-
-Another kind of return is the **infinite-horizon discounted return**, which is the sum of all rewards *ever* obtained by the agent, but discounted by how far off in the future they're obtained. This formulation of reward includes a discount factor :math:`\gamma \in (0,1)`:
-
-.. math::
-
- R(\tau) = \sum_{t=0}^{\infty} \gamma^t r_t.
-
-
-Why would we ever want a discount factor, though? Don't we just want to get *all* rewards? We do, but the discount factor is both intuitively appealing and mathematically convenient. On an intuitive level: cash now is better than cash later. Mathematically: an infinite-horizon sum of rewards `may not converge`_ to a finite value, and is hard to deal with in equations. But with a discount factor and under reasonable conditions, the infinite sum converges.
-
-.. admonition:: You Should Know
-
- While the line between these two formulations of return are quite stark in RL formalism, deep RL practice tends to blur the line a fair bit---for instance, we frequently set up algorithms to optimize the undiscounted return, but use discount factors in estimating **value functions**.
-
-.. _`may not converge`: https://en.wikipedia.org/wiki/Convergent_series
-
-The RL Problem
---------------
-
-Whatever the choice of return measure (whether infinite-horizon discounted, or finite-horizon undiscounted), and whatever the choice of policy, the goal in RL is to select a policy which maximizes **expected return** when the agent acts according to it.
-
-To talk about expected return, we first have to talk about probability distributions over trajectories.
-
-Let's suppose that both the environment transitions and the policy are stochastic. In this case, the probability of a :math:`T` -step trajectory is:
-
-.. math::
-
- P(\tau|\pi) = \rho_0 (s_0) \prod_{t=0}^{T-1} P(s_{t+1} | s_t, a_t) \pi(a_t | s_t).
-
-
-The expected return (for whichever measure), denoted by :math:`J(\pi)`, is then:
-
-.. math::
-
- J(\pi) = \int_{\tau} P(\tau|\pi) R(\tau) = \underE{\tau\sim \pi}{R(\tau)}.
-
-
-The central optimization problem in RL can then be expressed by
-
-.. math::
-
- \pi^* = \arg \max_{\pi} J(\pi),
-
-with :math:`\pi^*` being the **optimal policy**.
-
-
-Value Functions
----------------
-
-It's often useful to know the **value** of a state, or state-action pair. By value, we mean the expected return if you start in that state or state-action pair, and then act according to a particular policy forever after. **Value functions** are used, one way or another, in almost every RL algorithm.
-
-
-There are four main functions of note here.
-
-1. The **On-Policy Value Function**, :math:`V^{\pi}(s)`, which gives the expected return if you start in state :math:`s` and always act according to policy :math:`\pi`:
-
- .. math::
-
- V^{\pi}(s) = \underE{\tau \sim \pi}{R(\tau)\left| s_0 = s\right.}
-
-2. The **On-Policy Action-Value Function**, :math:`Q^{\pi}(s,a)`, which gives the expected return if you start in state :math:`s`, take an arbitrary action :math:`a` (which may not have come from the policy), and then forever after act according to policy :math:`\pi`:
-
- .. math::
-
- Q^{\pi}(s,a) = \underE{\tau \sim \pi}{R(\tau)\left| s_0 = s, a_0 = a\right.}
-
-
-3. The **Optimal Value Function**, :math:`V^*(s)`, which gives the expected return if you start in state :math:`s` and always act according to the *optimal* policy in the environment:
-
- .. math::
-
- V^*(s) = \max_{\pi} \underE{\tau \sim \pi}{R(\tau)\left| s_0 = s\right.}
-
-4. The **Optimal Action-Value Function**, :math:`Q^*(s,a)`, which gives the expected return if you start in state :math:`s`, take an arbitrary action :math:`a`, and then forever after act according to the *optimal* policy in the environment:
-
- .. math::
-
- Q^*(s,a) = \max_{\pi} \underE{\tau \sim \pi}{R(\tau)\left| s_0 = s, a_0 = a\right.}
-
-
-.. admonition:: You Should Know
-
- When we talk about value functions, if we do not make reference to time-dependence, we only mean expected **infinite-horizon discounted return**. Value functions for finite-horizon undiscounted return would need to accept time as an argument. Can you think about why? Hint: what happens when time's up?
-
-.. admonition:: You Should Know
-
- There are two key connections between the value function and the action-value function that come up pretty often:
-
- .. math::
-
- V^{\pi}(s) = \underE{a\sim \pi}{Q^{\pi}(s,a)},
-
- and
-
- .. math::
-
- V^*(s) = \max_a Q^* (s,a).
-
- These relations follow pretty directly from the definitions just given: can you prove them?
-
-The Optimal Q-Function and the Optimal Action
----------------------------------------------
-
-There is an important connection between the optimal action-value function :math:`Q^*(s,a)` and the action selected by the optimal policy. By definition, :math:`Q^*(s,a)` gives the expected return for starting in state :math:`s`, taking (arbitrary) action :math:`a`, and then acting according to the optimal policy forever after.
-
-The optimal policy in :math:`s` will select whichever action maximizes the expected return from starting in :math:`s`. As a result, if we have :math:`Q^*`, we can directly obtain the optimal action, :math:`a^*(s)`, via
-
-.. math::
-
- a^*(s) = \arg \max_a Q^* (s,a).
-
-Note: there may be multiple actions which maximize :math:`Q^*(s,a)`, in which case, all of them are optimal, and the optimal policy may randomly select any of them. But there is always an optimal policy which deterministically selects an action.
-
-
-Bellman Equations
------------------
-
-All four of the value functions obey special self-consistency equations called **Bellman equations**. The basic idea behind the Bellman equations is this:
-
- The value of your starting point is the reward you expect to get from being there, plus the value of wherever you land next.
-
-
-The Bellman equations for the on-policy value functions are
-
-.. math::
- :nowrap:
-
- \begin{align*}
- V^{\pi}(s) &= \underE{a \sim \pi \\ s'\sim P}{r(s,a) + \gamma V^{\pi}(s')}, \\
- Q^{\pi}(s,a) &= \underE{s'\sim P}{r(s,a) + \gamma \underE{a'\sim \pi}{Q^{\pi}(s',a')}},
- \end{align*}
-
-where :math:`s' \sim P` is shorthand for :math:`s' \sim P(\cdot |s,a)`, indicating that the next state :math:`s'` is sampled from the environment's transition rules; :math:`a \sim \pi` is shorthand for :math:`a \sim \pi(\cdot|s)`; and :math:`a' \sim \pi` is shorthand for :math:`a' \sim \pi(\cdot|s')`.
-
-The Bellman equations for the optimal value functions are
-
-.. math::
- :nowrap:
-
- \begin{align*}
- V^*(s) &= \max_a \underE{s'\sim P}{r(s,a) + \gamma V^*(s')}, \\
- Q^*(s,a) &= \underE{s'\sim P}{r(s,a) + \gamma \max_{a'} Q^*(s',a')}.
- \end{align*}
-
-The crucial difference between the Bellman equations for the on-policy value functions and the optimal value functions, is the absence or presence of the :math:`\max` over actions. Its inclusion reflects the fact that whenever the agent gets to choose its action, in order to act optimally, it has to pick whichever action leads to the highest value.
-
-.. admonition:: You Should Know
-
- The term "Bellman backup" comes up quite frequently in the RL literature. The Bellman backup for a state, or state-action pair, is the right-hand side of the Bellman equation: the reward-plus-next-value.
-
-
-Advantage Functions
--------------------
-
-Sometimes in RL, we don't need to describe how good an action is in an absolute sense, but only how much better it is than others on average. That is to say, we want to know the relative **advantage** of that action. We make this concept precise with the **advantage function.**
-
-The advantage function :math:`A^{\pi}(s,a)` corresponding to a policy :math:`\pi` describes how much better it is to take a specific action :math:`a` in state :math:`s`, over randomly selecting an action according to :math:`\pi(\cdot|s)`, assuming you act according to :math:`\pi` forever after. Mathematically, the advantage function is defined by
-
-.. math::
-
- A^{\pi}(s,a) = Q^{\pi}(s,a) - V^{\pi}(s).
-
-.. admonition:: You Should Know
-
- We'll discuss this more later, but the advantage function is crucially important to policy gradient methods.
-
-
-
-(Optional) Formalism
-====================
-
-So far, we've discussed the agent's environment in an informal way, but if you try to go digging through the literature, you're likely to run into the standard mathematical formalism for this setting: **Markov Decision Processes** (MDPs). An MDP is a 5-tuple, :math:`\langle S, A, R, P, \rho_0 \rangle`, where
-
-* :math:`S` is the set of all valid states,
-* :math:`A` is the set of all valid actions,
-* :math:`R : S \times A \times S \to \mathbb{R}` is the reward function, with :math:`r_t = R(s_t, a_t, s_{t+1})`,
-* :math:`P : S \times A \to \mathcal{P}(S)` is the transition probability function, with :math:`P(s'|s,a)` being the probability of transitioning into state :math:`s'` if you start in state :math:`s` and take action :math:`a`,
-* and :math:`\rho_0` is the starting state distribution.
-
-The name Markov Decision Process refers to the fact that the system obeys the `Markov property`_: transitions only depend on the most recent state and action, and no prior history.
-
-
-
-
-.. _`Markov property`: https://en.wikipedia.org/wiki/Markov_property
\ No newline at end of file
diff --git a/docs/_build/html/_sources/spinningup/rl_intro2.rst.txt b/docs/_build/html/_sources/spinningup/rl_intro2.rst.txt
deleted file mode 100644
index 1fb3597b2..000000000
--- a/docs/_build/html/_sources/spinningup/rl_intro2.rst.txt
+++ /dev/null
@@ -1,137 +0,0 @@
-==============================
-Part 2: Kinds of RL Algorithms
-==============================
-
-.. contents:: Table of Contents
- :depth: 2
-
-Now that we've gone through the basics of RL terminology and notation, we can cover a little bit of the richer material: the landscape of algorithms in modern RL, and a description of the kinds of trade-offs that go into algorithm design.
-
-A Taxonomy of RL Algorithms
-===========================
-
-.. figure:: ../images/rl_algorithms_9_15.svg
- :align: center
-
- A non-exhaustive, but useful taxonomy of algorithms in modern RL. `Citations below.`_
-
-We'll start this section with a disclaimer: it's really quite hard to draw an accurate, all-encompassing taxonomy of algorithms in the modern RL space, because the modularity of algorithms is not well-represented by a tree structure. Also, to make something that fits on a page and is reasonably digestible in an introduction essay, we have to omit quite a bit of more advanced material (exploration, transfer learning, meta learning, etc). That said, our goals here are
-
-* to highlight the most foundational design choices in deep RL algorithms about what to learn and how to learn it,
-* to expose the trade-offs in those choices,
-* and to place a few prominent modern algorithms into context with respect to those choices.
-
-Model-Free vs Model-Based RL
-----------------------------
-
-One of the most important branching points in an RL algorithm is the question of **whether the agent has access to (or learns) a model of the environment**. By a model of the environment, we mean a function which predicts state transitions and rewards.
-
-The main upside to having a model is that **it allows the agent to plan** by thinking ahead, seeing what would happen for a range of possible choices, and explicitly deciding between its options. Agents can then distill the results from planning ahead into a learned policy. A particularly famous example of this approach is `AlphaZero`_. When this works, it can result in a substantial improvement in sample efficiency over methods that don't have a model.
-
-The main downside is that **a ground-truth model of the environment is usually not available to the agent.** If an agent wants to use a model in this case, it has to learn the model purely from experience, which creates several challenges. The biggest challenge is that bias in the model can be exploited by the agent, resulting in an agent which performs well with respect to the learned model, but behaves sub-optimally (or super terribly) in the real environment. Model-learning is fundamentally hard, so even intense effort---being willing to throw lots of time and compute at it---can fail to pay off.
-
-Algorithms which use a model are called **model-based** methods, and those that don't are called **model-free**. While model-free methods forego the potential gains in sample efficiency from using a model, they tend to be easier to implement and tune. As of the time of writing this introduction (September 2018), model-free methods are more popular and have been more extensively developed and tested than model-based methods.
-
-
-What to Learn
--------------
-
-Another critical branching point in an RL algorithm is the question of **what to learn.** The list of usual suspects includes
-
-* policies, either stochastic or deterministic,
-* action-value functions (Q-functions),
-* value functions,
-* and/or environment models.
-
-
-
-What to Learn in Model-Free RL
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-There are two main approaches to representing and training agents with model-free RL:
-
-**Policy Optimization.** Methods in this family represent a policy explicitly as :math:`\pi_{\theta}(a|s)`. They optimize the parameters :math:`\theta` either directly by gradient ascent on the performance objective :math:`J(\pi_{\theta})`, or indirectly, by maximizing local approximations of :math:`J(\pi_{\theta})`. This optimization is almost always performed **on-policy**, which means that each update only uses data collected while acting according to the most recent version of the policy. Policy optimization also usually involves learning an approximator :math:`V_{\phi}(s)` for the on-policy value function :math:`V^{\pi}(s)`, which gets used in figuring out how to update the policy.
-
-A couple of examples of policy optimization methods are:
-
-* `A2C / A3C`_, which performs gradient ascent to directly maximize performance,
-* and `PPO`_, whose updates indirectly maximize performance, by instead maximizing a *surrogate objective* function which gives a conservative estimate for how much :math:`J(\pi_{\theta})` will change as a result of the update.
-
-**Q-Learning.** Methods in this family learn an approximator :math:`Q_{\theta}(s,a)` for the optimal action-value function, :math:`Q^*(s,a)`. Typically they use an objective function based on the `Bellman equation`_. This optimization is almost always performed **off-policy**, which means that each update can use data collected at any point during training, regardless of how the agent was choosing to explore the environment when the data was obtained. The corresponding policy is obtained via the connection between :math:`Q^*` and :math:`\pi^*`: the actions taken by the Q-learning agent are given by
-
-.. math::
-
- a(s) = \arg \max_a Q_{\theta}(s,a).
-
-Examples of Q-learning methods include
-
-* `DQN`_, a classic which substantially launched the field of deep RL,
-* and `C51`_, a variant that learns a distribution over return whose expectation is :math:`Q^*`.
-
-**Trade-offs Between Policy Optimization and Q-Learning.** The primary strength of policy optimization methods is that they are principled, in the sense that *you directly optimize for the thing you want.* This tends to make them stable and reliable. By contrast, Q-learning methods only *indirectly* optimize for agent performance, by training :math:`Q_{\theta}` to satisfy a self-consistency equation. There are many failure modes for this kind of learning, so it tends to be less stable. [1]_ But, Q-learning methods gain the advantage of being substantially more sample efficient when they do work, because they can reuse data more effectively than policy optimization techniques.
-
-**Interpolating Between Policy Optimization and Q-Learning.** Serendipitously, policy optimization and Q-learning are not incompatible (and under some circumstances, it turns out, `equivalent`_), and there exist a range of algorithms that live in between the two extremes. Algorithms that live on this spectrum are able to carefully trade-off between the strengths and weaknesses of either side. Examples include
-
-* `DDPG`_, an algorithm which concurrently learns a deterministic policy and a Q-function by using each to improve the other,
-* and `SAC`_, a variant which uses stochastic policies, entropy regularization, and a few other tricks to stabilize learning and score higher than DDPG on standard benchmarks.
-
-
-
-.. [1] For more information about how and why Q-learning methods can fail, see 1) this classic paper by `Tsitsiklis and van Roy`_, 2) the (much more recent) `review by Szepesvari`_ (in section 4.3.2), and 3) chapter 11 of `Sutton and Barto`_, especially section 11.3 (on "the deadly triad" of function approximation, bootstrapping, and off-policy data, together causing instability in value-learning algorithms).
-
-
-.. _`Bellman equation`: ../spinningup/rl_intro.html#bellman-equations
-.. _`Tsitsiklis and van Roy`: http://web.mit.edu/jnt/www/Papers/J063-97-bvr-td.pdf
-.. _`review by Szepesvari`: https://sites.ualberta.ca/~szepesva/papers/RLAlgsInMDPs.pdf
-.. _`Sutton and Barto`: https://drive.google.com/file/d/1xeUDVGWGUUv1-ccUMAZHJLej2C7aAFWY/view
-.. _`equivalent`: https://arxiv.org/abs/1704.06440
-
-What to Learn in Model-Based RL
--------------------------------
-
-Unlike model-free RL, there aren't a small number of easy-to-define clusters of methods for model-based RL: there are many orthogonal ways of using models. We'll give a few examples, but the list is far from exhaustive. In each case, the model may either be given or learned.
-
-**Background: Pure Planning.** The most basic approach *never* explicitly represents the policy, and instead, uses pure planning techniques like `model-predictive control`_ (MPC) to select actions. In MPC, each time the agent observes the environment, it computes a plan which is optimal with respect to the model, where the plan describes all actions to take over some fixed window of time after the present. (Future rewards beyond the horizon may be considered by the planning algorithm through the use of a learned value function.) The agent then executes the first action of the plan, and immediately discards the rest of it. It computes a new plan each time it prepares to interact with the environment, to avoid using an action from a plan with a shorter-than-desired planning horizon.
-
-* The `MBMF`_ work explores MPC with learned environment models on some standard benchmark tasks for deep RL.
-
-**Expert Iteration.** A straightforward follow-on to pure planning involves using and learning an explicit representation of the policy, :math:`\pi_{\theta}(a|s)`. The agent uses a planning algorithm (like Monte Carlo Tree Search) in the model, generating candidate actions for the plan by sampling from its current policy. The planning algorithm produces an action which is better than what the policy alone would have produced, hence it is an "expert" relative to the policy. The policy is afterwards updated to produce an action more like the planning algorithm's output.
-
-* The `ExIt`_ algorithm uses this approach to train deep neural networks to play Hex.
-* `AlphaZero`_ is another example of this approach.
-
-**Data Augmentation for Model-Free Methods.** Use a model-free RL algorithm to train a policy or Q-function, but either 1) augment real experiences with fictitious ones in updating the agent, or 2) use *only* fictitious experience for updating the agent.
-
-* See `MBVE`_ for an example of augmenting real experiences with fictitious ones.
-* See `World Models`_ for an example of using purely fictitious experience to train the agent, which they call "training in the dream."
-
-**Embedding Planning Loops into Policies.** Another approach embeds the planning procedure directly into a policy as a subroutine---so that complete plans become side information for the policy---while training the output of the policy with any standard model-free algorithm. The key concept is that in this framework, the policy can learn to choose how and when to use the plans. This makes model bias less of a problem, because if the model is bad for planning in some states, the policy can simply learn to ignore it.
-
-* See `I2A`_ for an example of agents being endowed with this style of imagination.
-
-.. _`model-predictive control`: https://en.wikipedia.org/wiki/Model_predictive_control
-.. _`ExIt`: https://arxiv.org/abs/1705.08439
-.. _`World Models`: https://worldmodels.github.io/
-
-
-
-Links to Algorithms in Taxonomy
-===============================
-
-.. _`Citations below.`:
-
-.. [#] `A2C / A3C <https://arxiv.org/abs/1602.01783>`_ (Asynchronous Advantage Actor-Critic): Mnih et al, 2016
-.. [#] `PPO <https://arxiv.org/abs/1707.06347>`_ (Proximal Policy Optimization): Schulman et al, 2017
-.. [#] `TRPO <https://arxiv.org/abs/1502.05477>`_ (Trust Region Policy Optimization): Schulman et al, 2015
-.. [#] `DDPG <https://arxiv.org/abs/1509.02971>`_ (Deep Deterministic Policy Gradient): Lillicrap et al, 2015
-.. [#] `TD3 <https://arxiv.org/abs/1802.09477>`_ (Twin Delayed DDPG): Fujimoto et al, 2018
-.. [#] `SAC <https://arxiv.org/abs/1801.01290>`_ (Soft Actor-Critic): Haarnoja et al, 2018
-.. [#] `DQN <https://arxiv.org/abs/1312.5602>`_ (Deep Q-Networks): Mnih et al, 2013
-.. [#] `C51 <https://arxiv.org/abs/1707.06887>`_ (Categorical 51-Atom DQN): Bellemare et al, 2017
-.. [#] `QR-DQN <https://arxiv.org/abs/1710.10044>`_ (Quantile Regression DQN): Dabney et al, 2017
-.. [#] `HER <https://arxiv.org/abs/1707.01495>`_ (Hindsight Experience Replay): Andrychowicz et al, 2017
-.. [#] `World Models`_: Ha and Schmidhuber, 2018
-.. [#] `I2A <https://arxiv.org/abs/1707.06203>`_ (Imagination-Augmented Agents): Weber et al, 2017
-.. [#] `MBMF <https://sites.google.com/view/mbmf>`_ (Model-Based RL with Model-Free Fine-Tuning): Nagabandi et al, 2017
-.. [#] `MBVE <https://arxiv.org/abs/1803.00101>`_ (Model-Based Value Expansion): Feinberg et al, 2018
-.. [#] `AlphaZero <https://arxiv.org/abs/1712.01815>`_: Silver et al, 2017
diff --git a/docs/_build/html/_sources/spinningup/rl_intro3.rst.txt b/docs/_build/html/_sources/spinningup/rl_intro3.rst.txt
deleted file mode 100644
index 0c96e47ab..000000000
--- a/docs/_build/html/_sources/spinningup/rl_intro3.rst.txt
+++ /dev/null
@@ -1,438 +0,0 @@
-====================================
-Part 3: Intro to Policy Optimization
-====================================
-
-.. contents:: Table of Contents
- :depth: 2
-
-
-In this section, we'll discuss the mathematical foundations of policy optimization algorithms, and connect the material to sample code. We will cover three key results in the theory of **policy gradients**:
-
-* `the simplest equation`_ describing the gradient of policy performance with respect to policy parameters,
-* a rule which allows us to `drop useless terms`_ from that expression,
-* and a rule which allows us to `add useful terms`_ to that expression.
-
-In the end, we'll tie those results together and describe the advantage-based expression for the policy gradient---the version we use in our `Vanilla Policy Gradient`_ implementation.
-
-.. _`the simplest equation`: ../spinningup/rl_intro3.html#deriving-the-simplest-policy-gradient
-.. _`drop useless terms`: ../spinningup/rl_intro3.html#don-t-let-the-past-distract-you
-.. _`add useful terms`: ../spinningup/rl_intro3.html#baselines-in-policy-gradients
-.. _`Vanilla Policy Gradient`: ../algorithms/vpg.html
-
-Deriving the Simplest Policy Gradient
-=====================================
-
-Here, we consider the case of a stochastic, parameterized policy, :math:`\pi_{\theta}`. We aim to maximize the expected return :math:`J(\pi_{\theta}) = \underE{\tau \sim \pi_{\theta}}{R(\tau)}`. For the purposes of this derivation, we'll take :math:`R(\tau)` to give the `finite-horizon undiscounted return`_, but the derivation for the infinite-horizon discounted return setting is almost identical.
-
-.. _`finite-horizon undiscounted return`: ../spinningup/rl_intro.html#reward-and-return
-
-We would like to optimize the policy by gradient ascent, e.g.
-
-.. math::
-
- \theta_{k+1} = \theta_k + \alpha \left. \nabla_{\theta} J(\pi_{\theta}) \right|_{\theta_k}.
-
-The gradient of policy performance, :math:`\nabla_{\theta} J(\pi_{\theta})`, is called the **policy gradient**, and algorithms that optimize the policy this way are called **policy gradient algorithms.** (Examples include Vanilla Policy Gradient and TRPO. PPO is often referred to as a policy gradient algorithm, though this is slightly inaccurate.)
-
-To actually use this algorithm, we need an expression for the policy gradient which we can numerically compute. This involves two steps: 1) deriving the analytical gradient of policy performance, which turns out to have the form of an expected value, and then 2) forming a sample estimate of that expected value, which can be computed with data from a finite number of agent-environment interaction steps.
-
-In this subsection, we'll find the simplest form of that expression. In later subsections, we'll show how to improve on the simplest form to get the version we actually use in standard policy gradient implementations.
-
-We'll begin by laying out a few facts which are useful for deriving the analytical gradient.
-
-**1. Probability of a Trajectory.** The probability of a trajectory :math:`\tau = (s_0, a_0, ..., s_{T+1})` given that actions come from :math:`\pi_{\theta}` is
-
-.. math::
-
- P(\tau|\theta) = \rho_0 (s_0) \prod_{t=0}^{T} P(s_{t+1}|s_t, a_t) \pi_{\theta}(a_t |s_t).
-
-
-**2. The Log-Derivative Trick.** The log-derivative trick is based on a simple rule from calculus: the derivative of :math:`\log x` with respect to :math:`x` is :math:`1/x`. When rearranged and combined with chain rule, we get:
-
-.. math::
-
- \nabla_{\theta} P(\tau | \theta) = P(\tau | \theta) \nabla_{\theta} \log P(\tau | \theta).
-
-
-**3. Log-Probability of a Trajectory.** The log-prob of a trajectory is just
-
-.. math::
-
- \log P(\tau|\theta) = \log \rho_0 (s_0) + \sum_{t=0}^{T} \bigg( \log P(s_{t+1}|s_t, a_t) + \log \pi_{\theta}(a_t |s_t)\bigg).
-
-
-**4. Gradients of Environment Functions.** The environment has no dependence on :math:`\theta`, so gradients of :math:`\rho_0(s_0)`, :math:`P(s_{t+1}|s_t, a_t)`, and :math:`R(\tau)` are zero.
-
-**5. Grad-Log-Prob of a Trajectory.** The gradient of the log-prob of a trajectory is thus
-
-.. math::
-
- \nabla_{\theta} \log P(\tau | \theta) &= \cancel{\nabla_{\theta} \log \rho_0 (s_0)} + \sum_{t=0}^{T} \bigg( \cancel{\nabla_{\theta} \log P(s_{t+1}|s_t, a_t)} + \nabla_{\theta} \log \pi_{\theta}(a_t |s_t)\bigg) \\
- &= \sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t |s_t).
-
-
-Putting it all together, we derive the following:
-
-.. admonition:: Derivation for Basic Policy Gradient
-
- .. math::
- :nowrap:
-
- \begin{align*}
- \nabla_{\theta} J(\pi_{\theta}) &= \nabla_{\theta} \underE{\tau \sim \pi_{\theta}}{R(\tau)} & \\
- &= \nabla_{\theta} \int_{\tau} P(\tau|\theta) R(\tau) & \text{Expand expectation} \\
- &= \int_{\tau} \nabla_{\theta} P(\tau|\theta) R(\tau) & \text{Bring gradient under integral} \\
- &= \int_{\tau} P(\tau|\theta) \nabla_{\theta} \log P(\tau|\theta) R(\tau) & \text{Log-derivative trick} \\
- &= \underE{\tau \sim \pi_{\theta}}{\nabla_{\theta} \log P(\tau|\theta) R(\tau)} & \text{Return to expectation form} \\
- \therefore \nabla_{\theta} J(\pi_{\theta}) &= \underE{\tau \sim \pi_{\theta}}{\sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) R(\tau)} & \text{Expression for grad-log-prob}
- \end{align*}
-
-This is an expectation, which means that we can estimate it with a sample mean. If we collect a set of trajectories :math:`\mathcal{D} = \{\tau_i\}_{i=1,...,N}` where each trajectory is obtained by letting the agent act in the environment using the policy :math:`\pi_{\theta}`, the policy gradient can be estimated with
-
-.. math::
-
- \hat{g} = \frac{1}{|\mathcal{D}|} \sum_{\tau \in \mathcal{D}} \sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) R(\tau),
-
-where :math:`|\mathcal{D}|` is the number of trajectories in :math:`\mathcal{D}` (here, :math:`N`).
-
-This last expression is the simplest version of the computable expression we desired. Assuming that we have represented our policy in a way which allows us to calculate :math:`\nabla_{\theta} \log \pi_{\theta}(a|s)`, and if we are able to run the policy in the environment to collect the trajectory dataset, we can compute the policy gradient and take an update step.
-
-Implementing the Simplest Policy Gradient
-=========================================
-
-We give a short Tensorflow implementation of this simple version of the policy gradient algorithm in ``spinup/examples/pg_math/1_simple_pg.py``. (It can also be viewed `on github <https://github.com/openai/spinningup/blob/master/spinup/examples/pg_math/1_simple_pg.py>`_.) It is only 122 lines long, so we highly recommend reading through it in depth. While we won't go through the entirety of the code here, we'll highlight and explain a few important pieces.
-
-**1. Making the Policy Network.**
-
-.. code-block:: python
- :linenos:
- :lineno-start: 25
-
- # make core of policy network
- obs_ph = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32)
- logits = mlp(obs_ph, sizes=hidden_sizes+[n_acts])
-
- # make action selection op (outputs int actions, sampled from policy)
- actions = tf.squeeze(tf.multinomial(logits=logits,num_samples=1), axis=1)
-
-This block builds a feedforward neural network categorical policy. (See the `Stochastic Policies`_ section in Part 1 for a refresher.) The ``logits`` tensor can be used to construct log-probabilities and probabilities for actions, and the ``actions`` tensor samples actions based on the probabilities implied by ``logits``.
-
-.. _`Stochastic Policies`: ../spinningup/rl_intro.html#stochastic-policies
-
-**2. Making the Loss Function.**
-
-.. code-block:: python
- :linenos:
- :lineno-start: 32
-
- # make loss function whose gradient, for the right data, is policy gradient
- weights_ph = tf.placeholder(shape=(None,), dtype=tf.float32)
- act_ph = tf.placeholder(shape=(None,), dtype=tf.int32)
- action_masks = tf.one_hot(act_ph, n_acts)
- log_probs = tf.reduce_sum(action_masks * tf.nn.log_softmax(logits), axis=1)
- loss = -tf.reduce_mean(weights_ph * log_probs)
-
-
-In this block, we build a "loss" function for the policy gradient algorithm. When the right data is plugged in, the gradient of this loss is equal to the policy gradient. The right data means a set of (state, action, weight) tuples collected while acting according to the current policy, where the weight for a state-action pair is the return from the episode to which it belongs. (Although as we will show in later subsections, there are other values you can plug in for the weight which also work correctly.)
-
-
-.. admonition:: You Should Know
-
- Even though we describe this as a loss function, it is **not** a loss function in the typical sense from supervised learning. There are two main differences from standard loss functions.
-
- **1. The data distribution depends on the parameters.** A loss function is usually defined on a fixed data distribution which is independent of the parameters we aim to optimize. Not so here, where the data must be sampled on the most recent policy.
-
- **2. It doesn't measure performance.** A loss function usually evaluates the performance metric that we care about. Here, we care about expected return, :math:`J(\pi_{\theta})`, but our "loss" function does not approximate this at all, even in expectation. This "loss" function is only useful to us because, when evaluated at the current parameters, with data generated by the current parameters, it has the negative gradient of performance.
-
- But after that first step of gradient descent, there is no more connection to performance. This means that minimizing this "loss" function, for a given batch of data, has *no* guarantee whatsoever of improving expected return. You can send this loss to :math:`-\infty` and policy performance could crater; in fact, it usually will. Sometimes a deep RL researcher might describe this outcome as the policy "overfitting" to a batch of data. This is descriptive, but should not be taken literally because it does not refer to generalization error.
-
- We raise this point because it is common for ML practitioners to interpret a loss function as a useful signal during training---"if the loss goes down, all is well." In policy gradients, this intuition is wrong, and you should only care about average return. The loss function means nothing.
-
-
-
-
-.. admonition:: You Should Know
-
- The approach used here to make the ``log_probs`` tensor---creating an action mask, and using it to select out particular log probabilities---*only* works for categorical policies. It does not work in general.
-
-
-
-**3. Running One Epoch of Training.**
-
-.. code-block:: python
- :linenos:
- :lineno-start: 45
-
- # for training policy
- def train_one_epoch():
- # make some empty lists for logging.
- batch_obs = [] # for observations
- batch_acts = [] # for actions
- batch_weights = [] # for R(tau) weighting in policy gradient
- batch_rets = [] # for measuring episode returns
- batch_lens = [] # for measuring episode lengths
-
- # reset episode-specific variables
- obs = env.reset() # first obs comes from starting distribution
- done = False # signal from environment that episode is over
- ep_rews = [] # list for rewards accrued throughout ep
-
- # render first episode of each epoch
- finished_rendering_this_epoch = False
-
- # collect experience by acting in the environment with current policy
- while True:
-
- # rendering
- if not(finished_rendering_this_epoch):
- env.render()
-
- # save obs
- batch_obs.append(obs.copy())
-
- # act in the environment
- act = sess.run(actions, {obs_ph: obs.reshape(1,-1)})[0]
- obs, rew, done, _ = env.step(act)
-
- # save action, reward
- batch_acts.append(act)
- ep_rews.append(rew)
-
- if done:
- # if episode is over, record info about episode
- ep_ret, ep_len = sum(ep_rews), len(ep_rews)
- batch_rets.append(ep_ret)
- batch_lens.append(ep_len)
-
- # the weight for each logprob(a|s) is R(tau)
- batch_weights += [ep_ret] * ep_len
-
- # reset episode-specific variables
- obs, done, ep_rews = env.reset(), False, []
-
- # won't render again this epoch
- finished_rendering_this_epoch = True
-
- # end experience loop if we have enough of it
- if len(batch_obs) > batch_size:
- break
-
- # take a single policy gradient update step
- batch_loss, _ = sess.run([loss, train_op],
- feed_dict={
- obs_ph: np.array(batch_obs),
- act_ph: np.array(batch_acts),
- weights_ph: np.array(batch_weights)
- })
- return batch_loss, batch_rets, batch_lens
-
-The ``train_one_epoch()`` function runs one "epoch" of policy gradient, which we define to be
-
-1) the experience collection step (L62-97), where the agent acts for some number of episodes in the environment using the most recent policy, followed by
-
-2) a single policy gradient update step (L99-105).
-
-The main loop of the algorithm just repeatedly calls ``train_one_epoch()``.
-
-
-Expected Grad-Log-Prob Lemma
-============================
-
-In this subsection, we will derive an intermediate result which is extensively used throughout the theory of policy gradients. We will call it the Expected Grad-Log-Prob (EGLP) lemma. [1]_
-
-**EGLP Lemma.** Suppose that :math:`P_{\theta}` is a parameterized probability distribution over a random variable, :math:`x`. Then:
-
-.. math::
-
- \underE{x \sim P_{\theta}}{\nabla_{\theta} \log P_{\theta}(x)} = 0.
-
-.. admonition:: Proof
-
- Recall that all probability distributions are *normalized*:
-
- .. math::
-
- \int_x P_{\theta}(x) = 1.
-
- Take the gradient of both sides of the normalization condition:
-
- .. math::
-
- \nabla_{\theta} \int_x P_{\theta}(x) = \nabla_{\theta} 1 = 0.
-
- Use the log derivative trick to get:
-
- .. math::
-
- 0 &= \nabla_{\theta} \int_x P_{\theta}(x) \\
- &= \int_x \nabla_{\theta} P_{\theta}(x) \\
- &= \int_x P_{\theta}(x) \nabla_{\theta} \log P_{\theta}(x) \\
- \therefore 0 &= \underE{x \sim P_{\theta}}{\nabla_{\theta} \log P_{\theta}(x)}.
-
-.. [1] The author of this article is not aware of this lemma being given a standard name anywhere in the literature. But given how often it comes up, it seems pretty worthwhile to give it some kind of name for ease of reference.
-
-Don't Let the Past Distract You
-===============================
-
-Examine our most recent expression for the policy gradient:
-
-.. math::
-
- \nabla_{\theta} J(\pi_{\theta}) = \underE{\tau \sim \pi_{\theta}}{\sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) R(\tau)}.
-
-Taking a step with this gradient pushes up the log-probabilities of each action in proportion to :math:`R(\tau)`, the sum of *all rewards ever obtained*. But this doesn't make much sense.
-
-Agents should really only reinforce actions on the basis of their *consequences*. Rewards obtained before taking an action have no bearing on how good that action was: only rewards that come *after*.
-
-It turns out that this intuition shows up in the math, and we can show that the policy gradient can also be expressed by
-
-.. math::
-
- \nabla_{\theta} J(\pi_{\theta}) = \underE{\tau \sim \pi_{\theta}}{\sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \sum_{t'=t}^T R(s_{t'}, a_{t'}, s_{t'+1})}.
-
-In this form, actions are only reinforced based on rewards obtained after they are taken.
-
-We'll call this form the "reward-to-go policy gradient," because the sum of rewards after a point in a trajectory,
-
-.. math::
-
- \hat{R}_t \doteq \sum_{t'=t}^T R(s_{t'}, a_{t'}, s_{t'+1}),
-
-is called the **reward-to-go** from that point, and this policy gradient expression depends on the reward-to-go from state-action pairs.
-
-.. admonition:: You Should Know
-
- **But how is this better?** A key problem with policy gradients is how many sample trajectories are needed to get a low-variance sample estimate for them. The formula we started with included terms for reinforcing actions proportional to past rewards, all of which had zero mean, but nonzero variance: as a result, they would just add noise to sample estimates of the policy gradient. By removing them, we reduce the number of sample trajectories needed.
-
-An (optional) proof of this claim can be found `here`_, and it ultimately depends on the EGLP lemma.
-
-.. _`here`: ../spinningup/extra_pg_proof1.html
-
-Implementing Reward-to-Go Policy Gradient
-=========================================
-
-We give a short Tensorflow implementation of the reward-to-go policy gradient in ``spinup/examples/pg_math/2_rtg_pg.py``. (It can also be viewed `on github `_.)
-
-The only thing that has changed from ``1_simple_pg.py`` is that we now use different weights in the loss function. The code modification is very slight: we add a new function, and change two other lines. The new function is:
-
-.. code-block:: python
- :linenos:
- :lineno-start: 12
-
- def reward_to_go(rews):
- n = len(rews)
- rtgs = np.zeros_like(rews)
- for i in reversed(range(n)):
- rtgs[i] = rews[i] + (rtgs[i+1] if i+1 < n else 0)
- return rtgs
-
-
-And then we tweak the old L86-87 from:
-
-.. code-block:: python
- :linenos:
- :lineno-start: 86
-
- # the weight for each logprob(a|s) is R(tau)
- batch_weights += [ep_ret] * ep_len
-
-to:
-
-.. code-block:: python
- :linenos:
- :lineno-start: 93
-
- # the weight for each logprob(a_t|s_t) is reward-to-go from t
- batch_weights += list(reward_to_go(ep_rews))
-
-
-
-Baselines in Policy Gradients
-=============================
-
-An immediate consequence of the EGLP lemma is that for any function :math:`b` which only depends on state,
-
-.. math::
-
- \underE{a_t \sim \pi_{\theta}}{\nabla_{\theta} \log \pi_{\theta}(a_t|s_t) b(s_t)} = 0.
-
-This allows us to add or subtract any number of terms like this from our expression for the policy gradient, without changing it in expectation:
-
-.. math::
-
- \nabla_{\theta} J(\pi_{\theta}) = \underE{\tau \sim \pi_{\theta}}{\sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \left(\sum_{t'=t}^T R(s_{t'}, a_{t'}, s_{t'+1}) - b(s_t)\right)}.
-
-Any function :math:`b` used in this way is called a **baseline**.
-
-The most common choice of baseline is the `on-policy value function`_ :math:`V^{\pi}(s_t)`. Recall that this is the average return an agent gets if it starts in state :math:`s_t` and then acts according to policy :math:`\pi` for the rest of its life.
-
-Empirically, the choice :math:`b(s_t) = V^{\pi}(s_t)` has the desirable effect of reducing variance in the sample estimate for the policy gradient. This results in faster and more stable policy learning. It is also appealing from a conceptual angle: it encodes the intuition that if an agent gets what it expected, it should "feel" neutral about it.
-
-.. admonition:: You Should Know
-
- In practice, :math:`V^{\pi}(s_t)` cannot be computed exactly, so it has to be approximated. This is usually done with a neural network, :math:`V_{\phi}(s_t)`, which is updated concurrently with the policy (so that the value network always approximates the value function of the most recent policy).
-
- The simplest method for learning :math:`V_{\phi}`, used in most implementations of policy optimization algorithms (including VPG, TRPO, PPO, and A2C), is to minimize a mean-squared-error objective:
-
- .. math:: \phi_k = \arg \min_{\phi} \underE{s_t, \hat{R}_t \sim \pi_k}{\left( V_{\phi}(s_t) - \hat{R}_t \right)^2},
-
- |
- where :math:`\pi_k` is the policy at epoch :math:`k`. This is done with one or more steps of gradient descent, starting from the previous value parameters :math:`\phi_{k-1}`.
-
-
-Other Forms of the Policy Gradient
-==================================
-
-What we have seen so far is that the policy gradient has the general form
-
-.. math::
-
- \nabla_{\theta} J(\pi_{\theta}) = \underE{\tau \sim \pi_{\theta}}{\sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \Phi_t},
-
-where :math:`\Phi_t` could be any of
-
-.. math:: \Phi_t &= R(\tau),
-
-or
-
-.. math:: \Phi_t &= \sum_{t'=t}^T R(s_{t'}, a_{t'}, s_{t'+1}),
-
-or
-
-.. math:: \Phi_t &= \sum_{t'=t}^T R(s_{t'}, a_{t'}, s_{t'+1}) - b(s_t).
-
-All of these choices lead to the same expected value for the policy gradient, despite having different variances. It turns out that there are two more valid choices of weights :math:`\Phi_t` which are important to know.
-
-**1. On-Policy Action-Value Function.** The choice
-
-.. math:: \Phi_t = Q^{\pi_{\theta}}(s_t, a_t)
-
-is also valid. See `this page`_ for an (optional) proof of this claim.
-
-**2. The Advantage Function.** Recall that the `advantage of an action`_, defined by :math:`A^{\pi}(s_t,a_t) = Q^{\pi}(s_t,a_t) - V^{\pi}(s_t)`, describes how much better or worse it is than other actions on average (relative to the current policy). This choice,
-
-.. math:: \Phi_t = A^{\pi_{\theta}}(s_t, a_t)
-
-is also valid. The proof is that it's equivalent to using :math:`\Phi_t = Q^{\pi_{\theta}}(s_t, a_t)` and then using a value function baseline, which we are always free to do.
-
-.. admonition:: You Should Know
-
- The formulation of policy gradients with advantage functions is extremely common, and there are many different ways of estimating the advantage function used by different algorithms.
-
-.. admonition:: You Should Know
-
- For a more detailed treatment of this topic, you should read the paper on `Generalized Advantage Estimation`_ (GAE), which goes into depth about different choices of :math:`\Phi_t` in the background sections.
-
- That paper then goes on to describe GAE, a method for approximating the advantage function in policy optimization algorithms which enjoys widespread use. For instance, Spinning Up's implementations of VPG, TRPO, and PPO make use of it. As a result, we strongly advise you to study it.
-
-
-Recap
-=====
-
-In this chapter, we described the basic theory of policy gradient methods and connected some of the early results to code examples. The interested student should continue from here by studying how the later results (value function baselines and the advantage formulation of policy gradients) translate into Spinning Up's implementation of `Vanilla Policy Gradient`_.
-
-.. _`on-policy value function`: ../spinningup/rl_intro.html#value-functions
-.. _`advantage of an action`: ../spinningup/rl_intro.html#advantage-functions
-.. _`this page`: ../spinningup/extra_pg_proof2.html
-.. _`Generalized Advantage Estimation`: https://arxiv.org/abs/1506.02438
-.. _`Vanilla Policy Gradient`: ../algorithms/vpg.html
\ No newline at end of file
diff --git a/docs/_build/html/_sources/spinningup/rl_intro4.rst.txt b/docs/_build/html/_sources/spinningup/rl_intro4.rst.txt
deleted file mode 100644
index 9b7a48efc..000000000
--- a/docs/_build/html/_sources/spinningup/rl_intro4.rst.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-=========================
-Limitations and Frontiers
-=========================
-
-
-Reward Design
-=============
-
-
-Sample Complexity
-=================
-
-
-Long-Horizon Tasks
-==================
\ No newline at end of file
diff --git a/docs/_build/html/_sources/spinningup/spinningup.rst.txt b/docs/_build/html/_sources/spinningup/spinningup.rst.txt
deleted file mode 100644
index ef0d46493..000000000
--- a/docs/_build/html/_sources/spinningup/spinningup.rst.txt
+++ /dev/null
@@ -1,250 +0,0 @@
-===================================
-Spinning Up as a Deep RL Researcher
-===================================
-By Joshua Achiam, October 13th, 2018
-
-
-.. contents:: Table of Contents
- :depth: 2
-
-If you’re an aspiring deep RL researcher, you’ve probably heard all kinds of things about deep RL by this point. You know that `it’s hard and it doesn’t always work`_. That even when you’re following a recipe, `reproducibility`_ `is a challenge`_. And that if you’re starting from scratch, `the learning curve is incredibly steep`_. It’s also the case that there are a lot of `great`_ `resources`_ `out`_ `there`_, but the material is new enough that there’s not a clear, well-charted path to mastery. The goal of this column is to help you get past the initial hurdle, and give you a clear sense of how to spin up as a deep RL researcher. In particular, this will outline a useful curriculum for increasing raw knowledge, while interleaving it with the odds and ends that lead to better research.
-
-
-The Right Background
-====================
-
-**Build up a solid mathematical background.** From probability and statistics, feel comfortable with random variables, Bayes’ theorem, chain rule of probability, expected values, standard deviations, and importance sampling. From multivariate calculus, understand gradients and (optionally, but it’ll help) Taylor series expansions.
-
-**Build up a general knowledge of deep learning.** You don’t need to know every single special trick and architecture, but the basics help. Know about standard architectures (`MLP`_, `vanilla RNN`_, `LSTM`_ (`also see this blog`_), `GRU`_, `conv`_ `layers`_, `resnets`_, `attention`_ `mechanisms`_), common regularizers (`weight decay`_, `dropout`_), normalization (`batch norm`_, `layer norm`_, `weight norm`_), and optimizers (`SGD, momentum SGD`_, `Adam`_, `others`_). Know what the `reparameterization trick`_ is.
-
-**Become familiar with at least one deep learning library.** `Tensorflow`_ or `PyTorch`_ would be a good place to start. You don’t need to know how to do everything, but you should feel pretty confident in implementing a simple program to do supervised learning.
-
-**Get comfortable with the main concepts and terminology in RL.** Know what states, actions, trajectories, policies, rewards, value functions, and action-value functions are. If you're unfamiliar, Spinning Up ships with `an introduction`_ to this material; it's also worth checking out the `RL-Intro`_ from the OpenAI Hackathon, or the exceptional and thorough `overview by Lilian Weng`_. Optionally, if you’re the sort of person who enjoys mathematical theory, study up on the math of `monotonic improvement theory`_ (which forms the basis for advanced policy gradient algorithms), or `classical RL algorithms`_ (which despite being superseded by deep RL algorithms, contain valuable insights that sometimes drive new research).
-
-Learn by Doing
-==============
-
-**Write your own implementations.** You should implement as many of the core deep RL algorithms from scratch as you can, with the aim of writing the shortest correct implementation of each. This is by far the best way to develop an understanding of how they work, as well as intuitions for their specific performance characteristics.
-
-**Simplicity is critical.** You should organize your efforts so that you implement the simplest algorithms first, and only gradually introduce complexity. If you start off trying to build something with too many moving parts, odds are good that it will break and you'll lose weeks trying to debug it. This is a common failure mode for people who are new to deep RL, and if you find yourself stuck in it, don't be discouraged---but do try to change tack and work on a simpler algorithm instead, before returning to the more complex thing later.
-
-**Which algorithms?** You should probably start with vanilla policy gradient (also called `REINFORCE`_), `DQN`_, `A2C`_ (the synchronous version of `A3C`_), `PPO`_ (the variant with the clipped objective), and `DDPG`_, approximately in that order. The simplest versions of all of these can be written in just a few hundred lines of code (ballpark 250-300), and some of them even less (for example, `a no-frills version of VPG`_ can be written in about 80 lines). Write single-threaded code before you try writing parallelized versions of these algorithms. (Do try to parallelize at least one.)
-
-**Focus on understanding.** Writing working RL code requires clear, detail-oriented understanding of the algorithms. This is because **broken RL code almost always fails silently,** where the code appears to run fine except that the agent never learns how to solve the task. Usually the problem is that something is being calculated with the wrong equation, or on the wrong distribution, or data is being piped into the wrong place. Sometimes the only way to find these bugs is to read the code with a critical eye, know exactly what it should be doing, and find where it deviates from the correct behavior. Developing that knowledge requires you to engage with both academic literature and other existing implementations (when possible), so a good amount of your time should be spent on that reading.
-
-**What to look for in papers:** When implementing an algorithm based on a paper, scour that paper, especially the ablation analyses and supplementary material (where available). The ablations will give you an intuition for what parameters or subroutines have the biggest impact on getting things to work, which will help you diagnose bugs. Supplementary material will often give information about specific details like network architecture and optimization hyperparameters, and you should try to align your implementation to these details to improve your chances of getting it working.
-
-**But don't overfit to paper details.** Sometimes, the paper prescribes the use of more tricks than are strictly necessary, so be a bit wary of this, and try out simplifications where possible. For example, the original DDPG paper suggests a complex neural network architecture and initialization scheme, as well as batch normalization. These aren't strictly necessary, and some of the best-reported results for DDPG use simpler networks. As another example, the original A3C paper uses asynchronous updates from the various actor-learners, but it turns out that synchronous updates work about as well.
-
-**Don't overfit to existing implementations either.** Study `existing`_ `implementations`_ for inspiration, but be careful not to overfit to the engineering details of those implementations. RL libraries frequently make choices for abstraction that are good for code reuse between algorithms, but which are unnecessary if you're only writing a single algorithm or supporting a single use case.
-
-**Iterate fast in simple environments.** To debug your implementations, try them with simple environments where learning should happen quickly, like CartPole-v0, InvertedPendulum-v0, FrozenLake-v0, and HalfCheetah-v2 (with a short time horizon---only 100 or 250 steps instead of the full 1000) from the `OpenAI Gym`_. Don’t try to run an algorithm in Atari or a complex Humanoid environment if you haven’t first verified that it works on the simplest possible toy task. Your ideal experiment turnaround-time at the debug stage is <5 minutes (on your local machine) or slightly longer but not much. These small-scale experiments don't require any special hardware, and can be run without too much trouble on CPUs.
-
-**If it doesn't work, assume there's a bug.** Spend a lot of effort searching for bugs before you resort to tweaking hyperparameters: usually it’s a bug. Bad hyperparameters can significantly degrade RL performance, but if you're using hyperparameters similar to the ones in papers and standard implementations, those will probably not be the issue. Also worth keeping in mind: sometimes things will work in one environment even when you have a breaking bug, so make sure to test in more than one environment once your results look promising.
-
-**Measure everything.** Do a lot of instrumenting to see what’s going on under-the-hood. The more stats about the learning process you read out at each iteration, the easier it is to debug---after all, you can’t tell it’s broken if you can’t see that it’s breaking. I personally like to look at the mean/std/min/max for cumulative rewards, episode lengths, and value function estimates, along with the losses for the objectives, and the details of any exploration parameters (like mean entropy for stochastic policy optimization, or current epsilon for epsilon-greedy as in DQN). Also, watch videos of your agent’s performance every now and then; this will give you some insights you wouldn’t get otherwise.
-
-**Scale experiments when things work.** After you have an implementation of an RL algorithm that seems to work correctly in the simplest environments, test it out on harder environments. Experiments at this stage will take longer---on the order of somewhere between a few hours and a couple of days, depending. Specialized hardware---like a beefy GPU or a 32-core machine---might be useful at this point, and you should consider looking into cloud computing resources like AWS or GCE.
-
-**Keep these habits!** These habits are worth keeping beyond the stage where you’re just learning about deep RL---they will accelerate your research!
-
-Developing a Research Project
-=============================
-
-Once you feel reasonably comfortable with the basics in deep RL, you should start pushing on the boundaries and doing research. To get there, you'll need an idea for a project.
-
-**Start by exploring the literature to become aware of topics in the field.** There are a wide range of topics you might find interesting: sample efficiency, exploration, transfer learning, hierarchy, memory, model-based RL, meta learning, and multi-agent, to name a few. If you're looking for inspiration, or just want to get a rough sense of what's out there, check out Spinning Up's `key papers <../spinningup/keypapers.html>`_ list. Find a paper that you enjoy on one of these subjects---something that inspires you---and read it thoroughly. Use the related work section and citations to find closely-related papers and do a deep dive in the literature. You’ll start to figure out where the unsolved problems are and where you can make an impact.
-
-**Approaches to idea-generation:** There are a many different ways to start thinking about ideas for projects, and the frame you choose influences how the project might evolve and what risks it will face. Here are a few examples:
-
-**Frame 1: Improving on an Existing Approach.** This is the incrementalist angle, where you try to get performance gains in an established problem setting by tweaking an existing algorithm. Reimplementing prior work is super helpful here, because it exposes you to the ways that existing algorithms are brittle and could be improved. A novice will find this the most accessible frame, but it can also be worthwhile for researchers at any level of experience. While some researchers find incrementalism less exciting, some of the most impressive achievements in machine learning have come from work of this nature.
-
-Because projects like these are tied to existing methods, they are by nature narrowly scoped and can wrap up quickly (a few months), which may be desirable (especially when starting out as a researcher). But this also sets up the risks: it's possible that the tweaks you have in mind for an algorithm may fail to improve it, in which case, unless you come up with more tweaks, the project is just over and you have no clear signal on what to do next.
-
-**Frame 2: Focusing on Unsolved Benchmarks.** Instead of thinking about how to improve an existing method, you aim to succeed on a task that no one has solved before. For example: achieving perfect generalization from training levels to test levels in the `Sonic domain`_ or `Gym Retro`_. When you hammer away at an unsolved task, you might try a wide variety of methods, including prior approaches and new ones that you invent for the project. It is possible for a novice to approch this kind of problem, but there will be a steeper learning curve.
-
-Projects in this frame have a broad scope and can go on for a while (several months to a year-plus). The main risk is that the benchmark is unsolvable without a substantial breakthrough, meaning that it would be easy to spend a lot of time without making any progress on it. But even if a project like this fails, it often leads the researcher to many new insights that become fertile soil for the next project.
-
-
-**Frame 3: Create a New Problem Setting.** Instead of thinking about existing methods or current grand challenges, think of an entirely different conceptual problem that hasn't been studied yet. Then, figure out how to make progress on it. For projects along these lines, a standard benchmark probably doesn't exist yet, and you will have to design one. This can be a huge challenge, but it’s worth embracing---great benchmarks move the whole field forward.
-
-Problems in this frame come up when they come up---it's hard to go looking for them.
-
-**Avoid reinventing the wheel.** When you come up with a good idea that you want to start testing, that’s great! But while you’re still in the early stages with it, do the most thorough check you can to make sure it hasn’t already been done. It can be pretty disheartening to get halfway through a project, and only then discover that there's already a paper about your idea. It's especially frustrating when the work is concurrent, which happens from time to time! But don’t let that deter you---and definitely don’t let it motivate you to plant flags with not-quite-finished research and over-claim the merits of the partial work. Do good research and finish out your projects with complete and thorough investigations, because that’s what counts, and by far what matters most in the long run.
-
-
-Doing Rigorous Research in RL
-=============================
-
-Now you’ve come up with an idea, and you’re fairly certain it hasn’t been done. You use the skills you’ve developed to implement it and you start testing it out on standard domains. It looks like it works! But what does that mean, and how well does it have to work to be important? This is one of the hardest parts of research in deep RL. In order to validate that your proposal is a meaningful contribution, you have to rigorously prove that it actually gets a performance benefit over the strongest possible baseline algorithm---whatever currently achieves SOTA (state of the art) on your test domains. If you’ve invented a new test domain, so there’s no previous SOTA, you still need to try out whatever the most reliable algorithm in the literature is that could plausibly do well in the new test domain, and then you have to beat that.
-
-**Set up fair comparisons.** If you implement your baseline from scratch---as opposed to comparing against another paper’s numbers directly---it’s important to spend as much time tuning your baseline as you spend tuning your own algorithm. This will make sure that comparisons are fair. Also, do your best to hold “all else equal” even if there are substantial differences between your algorithm and the baseline. For example, if you’re investigating architecture variants, keep the number of model parameters approximately equal between your model and the baseline. Under no circumstances handicap the baseline! It turns out that the baselines in RL are pretty strong, and getting big, consistent wins over them can be tricky or require some good insight in algorithm design.
-
-**Remove stochasticity as a confounder.** Beware of random seeds making things look stronger or weaker than they really are, so run everything for many random seeds (at least 3, but if you want to be thorough, do 10 or more). This is really important and deserves a lot of emphasis: deep RL seems fairly brittle with respect to random seed in a lot of common use cases. There’s potentially enough variance that two different groups of random seeds can yield learning curves with differences so significant that they look like they don’t come from the same distribution at all (see `figure 10 here`_).
-
-**Run high-integrity experiments.** Don’t just take the results from the best or most interesting runs to use in your paper. Instead, launch new, final experiments---for all of the methods that you intend to compare (if you are comparing against your own baseline implementations)---and precommit to report on whatever comes out of that. This is to enforce a weak form of `preregistration`_: you use the tuning stage to come up with your hypotheses, and you use the final runs to come up with your conclusions.
-
-**Check each claim separately.** Another critical aspect of doing research is to run an ablation analysis. Any method you propose is likely to have several key design decisions---like architecture choices or regularization techniques, for instance---each of which could separately impact performance. The claim you'll make in your work is that those design decisions collectively help, but this is really a bundle of several claims in disguise: one for each such design element. By systematically evaluating what would happen if you were to swap them out with alternate design choices, or remove them entirely, you can figure out how to correctly attribute credit for the benefits your method confers. This lets you make each separate claim with a measure of confidence, and increases the overall strength of your work.
-
-Closing Thoughts
-================
-
-Deep RL is an exciting, fast-moving field, and we need as many people as possible to go through the open problems and make progress on them. Hopefully, you feel a bit more prepared to be a part of it after reading this! And whenever you’re ready, `let us know`_.
-
-.. _`let us know`: https://jobs.lever.co/openai
-
-
-PS: Other Resources
-===================
-
-Consider reading through these other informative articles about growing as a researcher or engineer in this field:
-
-`Advice for Short-term Machine Learning Research Projects `_, by Tim Rocktäschel, Jakob Foerster and Greg Farquhar.
-
-`ML Engineering for AI Safety & Robustness: a Google Brain Engineer’s Guide to Entering the Field `_, by Catherine Olsson and 80,000 Hours.
-
-References
-==========
-
-.. _`it’s hard and it doesn’t always work`: https://www.alexirpan.com/2018/02/14/rl-hard.html
-.. [1] `Deep Reinforcement Learning Doesn't Work Yet `_, Alex Irpan, 2018
-
-.. _`reproducibility`: https://arxiv.org/abs/1708.04133
-.. _`figure 10 here`: https://arxiv.org/pdf/1708.04133.pdf
-.. [2] `Reproducibility of Benchmarked Deep Reinforcement Learning Tasks for Continuous Control `_, Islam et al, 2017
-
-.. _`is a challenge`: https://arxiv.org/abs/1709.06560
-.. [3] `Deep Reinforcement Learning that Matters `_, Henderson et al, 2017
-
-.. _`the learning curve is incredibly steep`: http://amid.fish/reproducing-deep-rl
-.. [4] `Lessons Learned Reproducing a Deep Reinforcement Learning Paper `_, Matthew Rahtz, 2018
-
-.. _`great`: http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html
-.. [5] `UCL Course on RL `_
-
-.. _`resources`: http://rll.berkeley.edu/deeprlcourse/
-.. [6] `Berkeley Deep RL Course `_
-
-.. _`out`: https://sites.google.com/view/deep-rl-bootcamp/lectures
-.. [7] `Deep RL Bootcamp `_
-
-.. _`there`: http://joschu.net/docs/nuts-and-bolts.pdf
-.. [8] `Nuts and Bolts of Deep RL `_, John Schulman
-
-.. _`MLP`: http://ufldl.stanford.edu/tutorial/supervised/MultiLayerNeuralNetworks/
-.. [9] `Stanford Deep Learning Tutorial: Multi-Layer Neural Network `_
-
-.. _`Vanilla RNN`: http://karpathy.github.io/2015/05/21/rnn-effectiveness/
-.. [10] `The Unreasonable Effectiveness of Recurrent Neural Networks `_, Andrej Karpathy, 2015
-
-.. _`LSTM`: https://arxiv.org/abs/1503.04069
-.. [11] `LSTM: A Search Space Odyssey `_, Greff et al, 2015
-
-.. _`also see this blog`: http://colah.github.io/posts/2015-08-Understanding-LSTMs/
-.. [12] `Understanding LSTM Networks `_, Chris Olah, 2015
-
-.. _`GRU`: https://arxiv.org/abs/1412.3555v1
-.. [13] `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling `_, Chung et al, 2014 (GRU paper)
-
-.. _`conv`: http://colah.github.io/posts/2014-07-Conv-Nets-Modular/
-.. [14] `Conv Nets: A Modular Perspective `_, Chris Olah, 2014
-
-.. _`layers`: https://cs231n.github.io/convolutional-networks/
-.. [15] `Stanford CS231n, Convolutional Neural Networks for Visual Recognition `_
-
-.. _`resnets`: https://arxiv.org/abs/1512.03385
-.. [16] `Deep Residual Learning for Image Recognition `_, He et al, 2015 (ResNets)
-
-.. _`attention`: https://arxiv.org/abs/1409.0473
-.. [17] `Neural Machine Translation by Jointly Learning to Align and Translate `_, Bahdanau et al, 2014 (Attention mechanisms)
-
-.. _`mechanisms`: https://arxiv.org/abs/1706.03762
-.. [18] `Attention Is All You Need `_, Vaswani et al, 2017
-
-.. _`weight decay`: https://papers.nips.cc/paper/563-a-simple-weight-decay-can-improve-generalization.pdf
-.. [19] `A Simple Weight Decay Can Improve Generalization `_, Krogh and Hertz, 1992
-
-
-.. _`dropout`: http://jmlr.org/papers/volume15/srivastava14a.old/srivastava14a.pdf
-.. [20] `Dropout: A Simple Way to Prevent Neural Networks from Overfitting `_, Srivastava et al, 2014
-
-.. _`batch norm`: https://arxiv.org/abs/1502.03167
-.. [21] `Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift `_, Ioffe and Szegedy, 2015
-
-.. _`layer norm`: https://arxiv.org/abs/1607.06450
-.. [22] `Layer Normalization `_, Ba et al, 2016
-
-.. _`weight norm`: https://arxiv.org/abs/1602.07868
-.. [23] `Weight Normalization: A Simple Reparameterization to Accelerate Training of Deep Neural Networks `_, Salimans and Kingma, 2016
-
-.. _`SGD, momentum SGD`: http://ufldl.stanford.edu/tutorial/supervised/OptimizationStochasticGradientDescent/
-.. [24] `Stanford Deep Learning Tutorial: Stochastic Gradient Descent `_
-
-.. _`Adam`: https://arxiv.org/abs/1412.6980
-.. [25] `Adam: A Method for Stochastic Optimization `_, Kingma and Ba, 2014
-
-.. _`others`: https://arxiv.org/abs/1609.04747
-.. [26] `An overview of gradient descent optimization algorithms `_, Sebastian Ruder, 2016
-
-.. _`reparameterization trick`: https://arxiv.org/abs/1312.6114
-.. [27] `Auto-Encoding Variational Bayes `_, Kingma and Welling, 2013 (Reparameterization trick)
-
-.. _`Tensorflow`: https://www.tensorflow.org/
-.. [28] `Tensorflow`_
-
-.. _`PyTorch`: http://pytorch.org/
-.. [29] `PyTorch`_
-
-.. _`an introduction`: ../spinningup/rl_intro.html
-.. [30] `Spinning Up in Deep RL: Introduction to RL, Part 1 <../spinningup/rl_intro.html>`_
-
-.. _`RL-Intro`: https://github.com/jachiam/rl-intro/blob/master/Presentation/rl_intro.pdf
-.. [31] `RL-Intro`_ Slides from OpenAI Hackathon, Josh Achiam, 2018
-
-.. _`overview by Lilian Weng`: https://lilianweng.github.io/lil-log/2018/02/19/a-long-peek-into-reinforcement-learning.html
-.. [32] `A (Long) Peek into Reinforcement Learning `_, Lilian Weng, 2018
-
-.. _`monotonic improvement theory`: http://joschu.net/docs/thesis.pdf
-.. [33] `Optimizing Expectations `_, John Schulman, 2016 (Monotonic improvement theory)
-
-.. _`classical RL algorithms`: https://sites.ualberta.ca/~szepesva/papers/RLAlgsInMDPs.pdf
-.. [34] `Algorithms for Reinforcement Learning `_, Csaba Szepesvari, 2009 (Classic RL Algorithms)
-
-.. _`REINFORCE`: https://arxiv.org/abs/1604.06778
-.. [35] `Benchmarking Deep Reinforcement Learning for Continuous Control `_, Duan et al, 2016
-
-.. _`DQN`: https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf
-.. [36] `Playing Atari with Deep Reinforcement Learning `_, Mnih et al, 2013 (DQN)
-
-.. _`A2C`: https://blog.openai.com/baselines-acktr-a2c/
-.. [37] `OpenAI Baselines: ACKTR & A2C `_
-
-.. _`A3C`: https://arxiv.org/abs/1602.01783
-.. [38] `Asynchronous Methods for Deep Reinforcement Learning `_, Mnih et al, 2016 (A3C)
-
-.. _`PPO`: https://arxiv.org/abs/1707.06347
-.. [39] `Proximal Policy Optimization Algorithms `_, Schulman et al, 2017 (PPO)
-
-.. _`DDPG`: https://arxiv.org/abs/1509.02971
-.. [40] `Continuous Control with Deep Reinforcement Learning `_, Lillicrap et al, 2015 (DDPG)
-
-.. _`a no-frills version of VPG`: https://github.com/jachiam/rl-intro/blob/master/pg_cartpole.py
-.. [41] `RL-Intro Policy Gradient Sample Code `_, Josh Achiam, 2018
-
-.. _`existing`: https://github.com/openai/baselines
-.. [42] `OpenAI Baselines `_
-
-.. _`implementations`: https://github.com/rll/rllab
-.. [43] `rllab `_
-
-.. _`OpenAI Gym`: https://gym.openai.com/
-.. [44] `OpenAI Gym `_
-
-.. _`Sonic domain`: https://contest.openai.com/2018-1/
-.. [45] `OpenAI Retro Contest `_
-
-.. _`Gym Retro`: https://blog.openai.com/gym-retro/
-.. [46] `OpenAI Gym Retro `_
-
-.. _`preregistration`: https://cos.io/prereg/
-.. [47] `Center for Open Science `_, explaining what preregistration means in the context of scientific experiments.
\ No newline at end of file
diff --git a/docs/_build/html/_sources/user/algorithms.rst.txt b/docs/_build/html/_sources/user/algorithms.rst.txt
deleted file mode 100644
index 6275448f6..000000000
--- a/docs/_build/html/_sources/user/algorithms.rst.txt
+++ /dev/null
@@ -1,113 +0,0 @@
-==========
-Algorithms
-==========
-
-.. contents:: Table of Contents
-
-What's Included
-===============
-
-The following algorithms are implemented in the Spinning Up package:
-
-- `Vanilla Policy Gradient`_ (VPG)
-- `Trust Region Policy Optimization`_ (TRPO)
-- `Proximal Policy Optimization`_ (PPO)
-- `Deep Deterministic Policy Gradient`_ (DDPG)
-- `Twin Delayed DDPG`_ (TD3)
-- `Soft Actor-Critic`_ (SAC)
-
-They are all implemented with `MLP`_ (non-recurrent) actor-critics, making them suitable for fully-observed, non-image-based RL environments, eg the `Gym Mujoco`_ environments.
-
-.. _`Gym Mujoco`: https://gym.openai.com/envs/#mujoco
-.. _`Vanilla Policy Gradient`: ../algorithms/vpg.html
-.. _`Trust Region Policy Optimization`: ../algorithms/trpo.html
-.. _`Proximal Policy Optimization`: ../algorithms/ppo.html
-.. _`Deep Deterministic Policy Gradient`: ../algorithms/ddpg.html
-.. _`Twin Delayed DDPG`: ../algorithms/td3.html
-.. _`Soft Actor-Critic`: ../algorithms/sac.html
-.. _`MLP`: https://en.wikipedia.org/wiki/Multilayer_perceptron
-
-
-Why These Algorithms?
-=====================
-
-We chose the core deep RL algorithms in this package to reflect useful progressions of ideas from the recent history of the field, culminating in two algorithms in particular---PPO and SAC---which are close to SOTA on reliability and sample efficiency among policy-learning algorithms. They also expose some of the trade-offs that get made in designing and using algorithms in deep RL.
-
-The On-Policy Algorithms
-------------------------
-
-Vanilla Policy Gradient is the most basic, entry-level algorithm in the deep RL space because it completely predates the advent of deep RL altogether. The core elements of VPG go all the way back to the late 80s / early 90s. It started a trail of research which ultimately led to stronger algorithms such as TRPO and then PPO soon after.
-
-A key feature of this line of work is that all of these algorithms are *on-policy*: that is, they don't use old data, which makes them weaker on sample efficiency. But this is for a good reason: these algorithms directly optimize the objective you care about---policy performance---and it works out mathematically that you need on-policy data to calculate the updates. So, this family of algorithms trades off sample efficiency in favor of stability---but you can see the progression of techniques (from VPG to TRPO to PPO) working to make up the deficit on sample efficiency.
-
-
-The Off-Policy Algorithms
--------------------------
-
-DDPG is a similarly foundational algorithm to VPG, although much younger---the theory of deterministic policy gradients, which led to DDPG, wasn't published until 2014. DDPG is closely connected to Q-learning algorithms, and it concurrently learns a Q-function and a policy which are updated to improve each other.
-
-Algorithms like DDPG and Q-Learning are *off-policy*, so they are able to reuse old data very efficiently. They gain this benefit by exploiting Bellman's equations for optimality, which a Q-function can be trained to satisfy using *any* environment interaction data (as long as there's enough experience from the high-reward areas in the environment).
-
-But problematically, there are no guarantees that doing a good job of satisfying Bellman's equations leads to having great policy performance. *Empirically* one can get great performance---and when it happens, the sample efficiency is wonderful---but the absence of guarantees makes algorithms in this class potentially brittle and unstable. TD3 and SAC are descendants of DDPG which make use of a variety of insights to mitigate these issues.
-
-
-Code Format
-===========
-
-All implementations in Spinning Up adhere to a standard template. They are split into two files: an algorithm file, which contains the core logic of the algorithm, and a core file, which contains various utilities needed to run the algorithm.
-
-The Algorithm File
-------------------
-
-The algorithm file always starts with a class definition for an experience buffer object, which is used to store information from agent-environment interactions.
-
-Next, there is a single function which runs the algorithm, performing the following tasks (in this order):
-
- 1) Logger setup
-
- 2) Random seed setting
-
- 3) Environment instantiation
-
- 4) Making placeholders for the computation graph
-
- 5) Building the actor-critic computation graph via the ``actor_critic`` function passed to the algorithm function as an argument
-
- 6) Instantiating the experience buffer
-
- 7) Building the computation graph for loss functions and diagnostics specific to the algorithm
-
- 8) Making training ops
-
- 9) Making the TF Session and initializing parameters
-
- 10) Setting up model saving through the logger
-
- 11) Defining functions needed for running the main loop of the algorithm (eg the core update function, get action function, and test agent function, depending on the algorithm)
-
- 12) Running the main loop of the algorithm:
-
- a) Run the agent in the environment
-
- b) Periodically update the parameters of the agent according to the main equations of the algorithm
-
- c) Log key performance metrics and save agent
-
-
-Finally, there's some support for directly running the algorithm in Gym environments from the command line.
-
-
-The Core File
--------------
-
-The core files don't adhere as closely as the algorithms files to a template, but do have some approximate structure:
-
- 1) Functions related to making and managing placeholders
-
- 2) Functions for building sections of computation graph relevant to the ``actor_critic`` method for a particular algorithm
-
- 3) Any other useful functions
-
- 4) Implementations for an MLP actor-critic compatible with the algorithm, where both the policy and the value function(s) are represented by simple MLPs
-
-
diff --git a/docs/_build/html/_sources/user/installation.rst.txt b/docs/_build/html/_sources/user/installation.rst.txt
deleted file mode 100644
index 1e2f8fafd..000000000
--- a/docs/_build/html/_sources/user/installation.rst.txt
+++ /dev/null
@@ -1,143 +0,0 @@
-============
-Installation
-============
-
-
-.. contents:: Table of Contents
-
-Spinning Up requires Python3, OpenAI Gym, and OpenMPI.
-
-Spinning Up is currently only supported on Linux and OSX. It may be possible to install on Windows, though this hasn't been extensively tested. [#]_
-
-.. admonition:: You Should Know
-
- Many examples and benchmarks in Spinning Up refer to RL environments that use the `MuJoCo`_ physics engine. MuJoCo is a proprietary software that requires a license, which is free to trial and free for students, but otherwise is not free. As a result, installing it is optional, but because of its importance to the research community---it is the de facto standard for benchmarking deep RL algorithms in continuous control---it is preferred.
-
- Don't worry if you decide not to install MuJoCo, though. You can definitely get started in RL by running RL algorithms on the `Classic Control`_ and `Box2d`_ environments in Gym, which are totally free to use.
-
-.. [#] It looks like at least one person has figured out `a workaround for running on Windows`_. If you try another way and succeed, please let us know how you did it!
-
-.. _`Classic Control`: https://gym.openai.com/envs/#classic_control
-.. _`Box2d`: https://gym.openai.com/envs/#box2d
-.. _`MuJoCo`: http://www.mujoco.org/index.html
-.. _`a workaround for running on Windows`: https://github.com/openai/spinningup/issues/23
-
-Installing Python
-=================
-
-We recommend installing Python through Anaconda. Anaconda is a library that includes Python and many useful packages for Python, as well as an environment manager called conda that makes package management simple.
-
-Follow `the installation instructions`_ for Anaconda here. Download and install Anaconda 3.x (at time of writing, 3.6). Then create a conda env for organizing packages used in Spinning Up:
-
-.. parsed-literal::
-
- conda create -n spinningup python=3.6
-
-To use Python from the environment you just created, activate the environment with:
-
-.. parsed-literal::
-
- source activate spinningup
-
-.. admonition:: You Should Know
-
- If you're new to python environments and package management, this stuff can quickly get confusing or overwhelming, and you'll probably hit some snags along the way. (Especially, you should expect problems like, "I just installed this thing, but it says it's not found when I try to use it!") You may want to read through some clean explanations about what package management is, why it's a good idea, and what commands you'll typically have to execute to correctly use it.
-
- `FreeCodeCamp`_ has a good explanation worth reading. There's a shorter description on `Towards Data Science`_ which is also helpful and informative. Finally, if you're an extremely patient person, you may want to read the (dry, but very informative) `documentation page from Conda`_.
-
-.. caution::
-
- As of November 2018, there appears to be a bug which prevents the Tensorflow pip package from working in Python 3.7. To track, see `this Github issue for Tensorflow`_. As a result, in order to use Spinning Up (which requires Tensorflow), you should use Python 3.6.
-
-
-.. _`the installation instructions`: https://docs.continuum.io/anaconda/install/
-.. _`FreeCodeCamp`: https://medium.freecodecamp.org/why-you-need-python-environments-and-how-to-manage-them-with-conda-85f155f4353c
-.. _`Towards Data Science`: https://towardsdatascience.com/environment-management-with-conda-python-2-3-b9961a8a5097
-.. _`documentation page from Conda`: https://conda.io/docs/user-guide/tasks/manage-environments.html
-.. _`this Github issue for Tensorflow`: https://github.com/tensorflow/tensorflow/issues/20444
-
-
-Installing OpenMPI
-==================
-
-Ubuntu
-------
-
-.. parsed-literal::
-
- sudo apt-get update && sudo apt-get install libopenmpi-dev
-
-
-Mac OS X
---------
-Installation of system packages on Mac requires Homebrew_. With Homebrew installed, run the following:
-
-.. parsed-literal::
-
- brew install openmpi
-
-.. _Homebrew: https://brew.sh
-
-Installing Spinning Up
-======================
-
-.. parsed-literal::
-
- git clone https://github.com/openai/spinningup.git
- cd spinningup
- pip install -e .
-
-.. admonition:: You Should Know
-
- Spinning Up defaults to installing everything in Gym **except** the MuJoCo environments. In case you run into any trouble with the Gym installation, check out the `Gym`_ github page for help. If you want the MuJoCo environments, see the optional installation arguments below.
-
-.. _`Gym`: https://github.com/openai/gym
-
-Check Your Install
-==================
-
-To see if you've successfully installed Spinning Up, try running PPO in the LunarLander-v2 environment with
-
-.. parsed-literal::
-
- python -m spinup.run ppo --hid "[32,32]" --env LunarLander-v2 --exp_name installtest --gamma 0.999
-
-This might run for around 10 minutes, and you can leave it going in the background while you continue reading through documentation. This won't train the agent to completion, but will run it for long enough that you can see *some* learning progress when the results come in.
-
-After it finishes training, watch a video of the trained policy with
-
-.. parsed-literal::
-
- python -m spinup.run test_policy data/installtest/installtest_s0
-
-And plot the results with
-
-.. parsed-literal::
-
- python -m spinup.run plot data/installtest/installtest_s0
-
-
-Installing MuJoCo (Optional)
-============================
-
-First, go to the `mujoco-py`_ github page. Follow the installation instructions in the README, which describe how to install the MuJoCo physics engine and the mujoco-py package (which allows the use of MuJoCo from Python).
-
-.. admonition:: You Should Know
-
- In order to use the MuJoCo simulator, you will need to get a `MuJoCo license`_. Free 30-day licenses are available to anyone, and free 1-year licenses are available to full-time students.
-
-Once you have installed MuJoCo, install the corresponding Gym environments with
-
-.. parsed-literal::
-
- pip install gym[mujoco,robotics]
-
-And then check that things are working by running PPO in the Walker2d-v2 environment with
-
-.. parsed-literal::
-
- python -m spinup.run ppo --hid "[32,32]" --env Walker2d-v2 --exp_name mujocotest
-
-
-.. _`mujoco-py`: https://github.com/openai/mujoco-py
-.. _`MuJoCo license`: https://www.roboti.us/license.html
\ No newline at end of file
diff --git a/docs/_build/html/_sources/user/introduction.rst.txt b/docs/_build/html/_sources/user/introduction.rst.txt
deleted file mode 100644
index 8dd89efd6..000000000
--- a/docs/_build/html/_sources/user/introduction.rst.txt
+++ /dev/null
@@ -1,108 +0,0 @@
-============
-Introduction
-============
-
-.. contents:: Table of Contents
-
-What This Is
-============
-
-Welcome to Spinning Up in Deep RL! This is an educational resource produced by OpenAI that makes it easier to learn about deep reinforcement learning (deep RL).
-
-For the unfamiliar: `reinforcement learning`_ (RL) is a machine learning approach for teaching agents how to solve tasks by trial and error. Deep RL refers to the combination of RL with `deep learning`_.
-
-This module contains a variety of helpful resources, including:
-
-- a short `introduction`_ to RL terminology, kinds of algorithms, and basic theory,
-- an `essay`_ about how to grow into an RL research role,
-- a `curated list`_ of important papers organized by topic,
-- a well-documented `code repo`_ of short, standalone implementations of key algorithms,
-- and a few `exercises`_ to serve as warm-ups.
-
-
-.. _`reinforcement learning`: https://en.wikipedia.org/wiki/Reinforcement_learning
-.. _`deep learning`: http://ufldl.stanford.edu/tutorial/
-
-Why We Built This
-=================
-
-One of the single most common questions that we hear is
-
- | If I want to contribute to AI safety, how do I get started?
-
-At OpenAI, we believe that deep learning generally---and deep reinforcement learning specifically---will play central roles in the development of powerful AI technology. To ensure that AI is safe, we have to come up with safety strategies and algorithms that are compatible with this paradigm. As a result, we encourage everyone who asks this question to study these fields.
-
-However, while there are many resources to help people quickly ramp up on deep learning, deep reinforcement learning is more challenging to break into. To begin with, a student of deep RL needs to have some background in math, coding, and regular deep learning. Beyond that, they need both a high-level view of the field---an awareness of what topics are studied in it, why they matter, and what's been done already---and careful instruction on how to connect algorithm theory to algorithm code.
-
-The high-level view is hard to come by because of how new the field is. There is not yet a standard deep RL textbook, so most of the knowledge is locked up in either papers or lecture series, which can take a long time to parse and digest. And learning to implement deep RL algorithms is typically painful, because either
-
-- the paper that publishes an algorithm omits or inadvertently obscures key design details,
-- or widely-public implementations of an algorithm are hard to read, hiding how the code lines up with the algorithm.
-
-While fantastic repos like rllab_, Baselines_, and rllib_ make it easier for researchers who are already in the field to make progress, they build algorithms into frameworks in ways that involve many non-obvious choices and trade-offs, which makes them hard to learn from. Consequently, the field of deep RL has a pretty high barrier to entry---for new researchers as well as practitioners and hobbyists.
-
-So our package here is designed to serve as the missing middle step for people who are excited by deep RL, and would like to learn how to use it or make a contribution, but don't have a clear sense of what to study or how to transmute algorithms into code. We've tried to make this as helpful a launching point as possible.
-
-That said, practitioners aren't the only people who can (or should) benefit from these materials. Solving AI safety will require people with a wide range of expertise and perspectives, and many relevant professions have no connection to engineering or computer science at all. Nonetheless, everyone involved will need to learn enough about the technology to make informed decisions, and several pieces of Spinning Up address that need.
-
-
-
-How This Serves Our Mission
-===========================
-
-OpenAI's mission_ is to ensure the safe development of AGI and the broad distribution of benefits from AI more generally. Teaching tools like Spinning Up help us make progress on both of these objectives.
-
-To begin with, we move closer to broad distribution of benefits any time we help people understand what AI is and how it works. This empowers people to think critically about the many issues we anticipate will arise as AI becomes more sophisticated and important in our lives.
-
-Also, critically, `we need people to help `_ us work on making sure that AGI is safe. This requires a skill set which is currently in short supply because of how new the field is. We know that many people are interested in helping us, but don't know how---here is what you should study! If you can become an expert on this material, you can make a difference on AI safety.
-
-
-
-Code Design Philosophy
-======================
-
-The algorithm implementations in the Spinning Up repo are designed to be
-
- - as simple as possible while still being reasonably good,
- - and highly-consistent with each other to expose fundamental similarities between algorithms.
-
-They are almost completely self-contained, with virtually no common code shared between them (except for logging, saving, loading, and `MPI `_ utilities), so that an interested person can study each algorithm separately without having to dig through an endless chain of dependencies to see how something is done. The implementations are patterned so that they come as close to pseudocode as possible, to minimize the gap between theory and code.
-
-Importantly, they're all structured similarly, so if you clearly understand one, jumping into the next is painless.
-
-We tried to minimize the number of tricks used in each algorithm's implementation, and minimize the differences between otherwise-similar algorithms. To give some examples of removed tricks: we omit regularization_ terms present in the original Soft Actor-Critic code, as well as `observation normalization`_ from all algorithms. For an example of where we've removed differences between algorithms: our implementations of DDPG, TD3, and SAC all follow a convention laid out in the `original TD3 code`_, where all gradient descent updates are performed at the ends of episodes (instead of happening all throughout the episode).
-
-All algorithms are "reasonably good" in the sense that they achieve roughly the intended performance, but don't necessarily match the best reported results in the literature on every task. Consequently, be careful if using any of these implementations for scientific benchmarking comparisons. Details on each implementation's specific performance level can be found on our `benchmarks`_ page.
-
-
-Support Plan
-============
-
-We plan to support Spinning Up to ensure that it serves as a helpful resource for learning about deep reinforcement learning. The exact nature of long-term (multi-year) support for Spinning Up is yet to be determined, but in the short run, we commit to:
-
-- High-bandwidth support for the first three weeks after release (Nov 8, 2018 to Nov 29, 2018).
-
- + We'll move quickly on bug-fixes, question-answering, and modifications to the docs to clear up ambiguities.
- + We'll work hard to streamline the user experience, in order to make it as easy as possible to self-study with Spinning Up.
-
-- Approximately six months after release (in April 2019), we'll do a serious review of the state of the package based on feedback we receive from the community, and announce any plans for future modification, including a long-term roadmap.
-
-Additionally, as discussed in the blog post, we are using Spinning Up in the curriculum for our upcoming cohorts of Scholars_ and Fellows_. Any changes and updates we make for their benefit will immediately become public as well.
-
-
-.. _`introduction`: ../spinningup/rl_intro.html
-.. _`essay`: ../spinningup/spinningup.html
-.. _`Spinning Up essay`: ../spinningup/spinningup.html
-.. _`curated list`: ../spinningup/keypapers.html
-.. _`code repo`: https://github.com/openai/spinningup
-.. _`exercises`: ../spinningup/exercises.html
-.. _`rllab`: https://github.com/rll/rllab
-.. _`Baselines`: https://github.com/openai/baselines
-.. _`rllib`: https://github.com/ray-project/ray/tree/master/python/ray/rllib
-.. _`mission`: https://blog.openai.com/openai-charter/
-.. _`regularization`: https://github.com/haarnoja/sac/blob/108a4229be6f040360fcca983113df9c4ac23a6a/sac/distributions/normal.py#L69
-.. _`observation normalization`: https://github.com/openai/baselines/blob/28aca637d0f13f4415cc5ebb778144154cff3110/baselines/run.py#L131
-.. _`original TD3 code`: https://github.com/sfujim/TD3/blob/25dfc0a6562c54ae5575fad5b8f08bc9d5c4e26c/main.py#L89
-.. _`benchmarks`: ../spinningup/bench.html
-.. _Scholars : https://jobs.lever.co/openai/cf6de4ed-4afd-4ace-9273-8842c003c842
-.. _Fellows : https://jobs.lever.co/openai/c9ba3f64-2419-4ff9-b81d-0526ae059f57
diff --git a/docs/_build/html/_sources/user/plotting.rst.txt b/docs/_build/html/_sources/user/plotting.rst.txt
deleted file mode 100644
index 7b294c2a5..000000000
--- a/docs/_build/html/_sources/user/plotting.rst.txt
+++ /dev/null
@@ -1,70 +0,0 @@
-================
-Plotting Results
-================
-
-Spinning Up ships with a simple plotting utility for interpreting results. Run it with:
-
-.. parsed-literal::
-
- python -m spinup.run plot [path/to/output_directory ...] [--legend [LEGEND ...]]
- [--xaxis XAXIS] [--value [VALUE ...]] [--count] [--smooth S]
- [--select [SEL ...]] [--exclude [EXC ...]]
-
-
-**Positional Arguments:**
-
-.. option:: logdir
-
- *strings*. As many log directories (or prefixes to log directories, which the plotter will autocomplete internally) as you'd like to plot from. Logdirs will be searched recursively for experiment outputs.
-
- .. admonition:: You Should Know
-
- The internal autocompleting is really handy! Suppose you have run several experiments, with the aim of comparing performance between different algorithms, resulting in a log directory structure of:
-
- .. parsed-literal::
-
- data/
- bench_algo1/
- bench_algo1-seed0/
- bench_algo1-seed10/
- bench_algo2/
- bench_algo2-seed0/
- bench_algo2-seed10/
-
- You can easily produce a graph comparing algo1 and algo2 with:
-
- .. parsed-literal::
-
- python spinup/utils/plot.py data/bench_algo
-
- relying on the autocomplete to find both ``data/bench_algo1`` and ``data/bench_algo2``.
-
-**Optional Arguments:**
-
-.. option:: -l, --legend=[LEGEND ...]
-
- *strings*. Optional way to specify legend for the plot. The plotter legend will automatically use the ``exp_name`` from the ``config.json`` file, unless you tell it otherwise through this flag. This only works if you provide a name for each directory that will get plotted. (Note: this may not be the same as the number of logdir args you provide! Recall that the plotter looks for autocompletes of the logdir args: there may be more than one match for a given logdir prefix, and you will need to provide a legend string for each one of those matches---unless you have removed some of them as candidates via selection or exclusion rules (below).)
-
-.. option:: -x, --xaxis=XAXIS, default='TotalEnvInteracts'
-
- *string*. Pick what column from data is used for the x-axis.
-
-.. option:: -y, --value=[VALUE ...], default='Performance'
-
- *strings*. Pick what columns from data to graph on the y-axis. Submitting multiple values will produce multiple graphs. Defaults to ``Performance``, which is not an actual output of any algorithm. Instead, ``Performance`` refers to either ``AverageEpRet``, the correct performance measure for the on-policy algorithms, or ``AverageTestEpRet``, the correct performance measure for the off-policy algorithms. The plotter will automatically figure out which of ``AverageEpRet`` or ``AverageTestEpRet`` to report for each separate logdir.
-
-.. option:: --count
-
- Optional flag. By default, the plotter shows y-values which are averaged across all results that share an ``exp_name``, which is typically a set of identical experiments that only vary in random seed. But if you'd like to see all of those curves separately, use the ``--count`` flag.
-
-.. option:: -s, --smooth=S, default=1
-
- *int*. Smooth data by averaging it over a fixed window. This parameter says how wide the averaging window will be.
-
-.. option:: --select=[SEL ...]
-
- *strings*. Optional selection rule: the plotter will only show curves from logdirs that contain all of these substrings.
-
-.. option:: --exclude=[EXC ...]
-
- *strings*. Optional exclusion rule: plotter will only show curves from logdirs that do not contain these substrings.
diff --git a/docs/_build/html/_sources/user/running.rst.txt b/docs/_build/html/_sources/user/running.rst.txt
deleted file mode 100644
index 23df1cd7d..000000000
--- a/docs/_build/html/_sources/user/running.rst.txt
+++ /dev/null
@@ -1,295 +0,0 @@
-===================
-Running Experiments
-===================
-
-
-.. contents:: Table of Contents
-
-One of the best ways to get a feel for deep RL is to run the algorithms and see how they perform on different tasks. The Spinning Up code library makes small-scale (local) experiments easy to do, and in this section, we'll discuss two ways to run them: either from the command line, or through function calls in scripts.
-
-
-Launching from the Command Line
-===============================
-
-
-Spinning Up ships with ``spinup/run.py``, a convenient tool that lets you easily launch any algorithm (with any choices of hyperparameters) from the command line. It also serves as a thin wrapper over the utilities for watching trained policies and plotting, although we will not discuss that functionality on this page (for those details, see the pages on `experiment outputs`_ and `plotting`_).
-
-The standard way to run a Spinning Up algorithm from the command line is
-
-.. parsed-literal::
-
- python -m spinup.run [algo name] [experiment flags]
-
-eg:
-
-.. parsed-literal::
-
- python -m spinup.run ppo --env Walker2d-v2 --exp_name walker
-
-.. _`experiment outputs`: ../user/saving_and_loading.html
-.. _`plotting`: ../user/plotting.html
-
-.. admonition:: You Should Know
-
- If you are using ZShell: ZShell interprets square brackets as special characters. Spinning Up uses square brackets in a few ways for command line arguments; make sure to escape them, or try the solution recommended `here `_ if you want to escape them by default.
-
-.. admonition:: Detailed Quickstart Guide
-
- .. parsed-literal::
-
- python -m spinup.run ppo --exp_name ppo_ant --env Ant-v2 --clip_ratio 0.1 0.2
- --hid[h] [32,32] [64,32] --act tf.nn.tanh --seed 0 10 20 --dt
- --data_dir path/to/data
-
- runs PPO in the ``Ant-v2`` Gym environment, with various settings controlled by the flags.
-
- ``clip_ratio``, ``hid``, and ``act`` are flags to set some algorithm hyperparameters. You can provide multiple values for hyperparameters to run multiple experiments. Check the docs to see what hyperparameters you can set (click here for the `PPO documentation`_).
-
- ``hid`` and ``act`` are `special shortcut flags`_ for setting the hidden sizes and activation function for the neural networks trained by the algorithm.
-
- The ``seed`` flag sets the seed for the random number generator. RL algorithms have high variance, so try multiple seeds to get a feel for how performance varies.
-
- The ``dt`` flag ensures that the save directory names will have timestamps in them (otherwise they don't, unless you set ``FORCE_DATESTAMP=True`` in ``spinup/user_config.py``).
-
- The ``data_dir`` flag allows you to set the save folder for results. The default value is set by ``DEFAULT_DATA_DIR`` in ``spinup/user_config.py``, which will be a subfolder ``data`` in the ``spinningup`` folder (unless you change it).
-
- `Save directory names`_ are based on ``exp_name`` and any flags which have multiple values. Instead of the full flag, a shorthand will appear in the directory name. Shorthands can be provided by the user in square brackets after the flag, like ``--hid[h]``; otherwise, shorthands are substrings of the flag (``clip_ratio`` becomes ``cli``). To illustrate, the save directory for the run with ``clip_ratio=0.1``, ``hid=[32,32]``, and ``seed=10`` will be:
-
- .. parsed-literal::
-
- path/to/data/YY-MM-DD_ppo_ant_cli0-1_h32-32/YY-MM-DD_HH-MM-SS-ppo_ant_cli0-1_h32-32_seed10
-
-.. _`PPO documentation`: ../algorithms/ppo.html#spinup.ppo
-.. _`special shortcut flags`: ../user/running.html#shortcut-flags
-.. _`Save directory names`: ../user/running.html#where-results-are-saved
-
-Setting Hyperparameters from the Command Line
----------------------------------------------
-
-Every hyperparameter in every algorithm can be controlled directly from the command line. If ``kwarg`` is a valid keyword arg for the function call of an algorithm, you can set values for it with the flag ``--kwarg``. To find out what keyword args are available, see either the docs page for an algorithm, or try
-
-.. parsed-literal::
-
- python -m spinup.run [algo name] --help
-
-to see a readout of the docstring.
-
-.. admonition:: You Should Know
-
- Values pass through ``eval()`` before being used, so you can describe some functions and objects directly from the command line. For example:
-
- .. parsed-literal::
-
- python -m spinup.run ppo --env Walker2d-v2 --exp_name walker --act tf.nn.elu
-
- sets ``tf.nn.elu`` as the activation function.
-
-.. admonition:: You Should Know
-
- There's some nice handling for kwargs that take dict values. Instead of having to provide
-
- .. parsed-literal::
-
- --key dict(v1=value_1, v2=value_2)
-
- you can give
-
- .. parsed-literal::
-
- --key:v1 value_1 --key:v2 value_2
-
- to get the same result.
-
-Launching Multiple Experiments at Once
---------------------------------------
-
-You can launch multiple experiments, to be executed **in series**, by simply providing more than one value for a given argument. (An experiment for each possible combination of values will be launched.)
-
-For example, to launch otherwise-equivalent runs with different random seeds (0, 10, and 20), do:
-
-.. parsed-literal::
-
- python -m spinup.run ppo --env Walker2d-v2 --exp_name walker --seed 0 10 20
-
-Experiments don't launch in parallel because they soak up enough resources that executing several at the same time wouldn't get a speedup.
-
-
-
-Special Flags
--------------
-
-A few flags receive special treatment.
-
-
-Environment Flag
-^^^^^^^^^^^^^^^^
-
-.. option:: --env, --env_name
-
- *string*. The name of an environment in the OpenAI Gym. All Spinning Up algorithms are implemented as functions that accept ``env_fn`` as an argument, where ``env_fn`` must be a callable function that builds a copy of the RL environment. Since the most common use case is Gym environments, though, all of which are built through ``gym.make(env_name)``, we allow you to just specify ``env_name`` (or ``env`` for short) at the command line, which gets converted to a lambda-function that builds the correct gym environment.
-
-
-Shortcut Flags
-^^^^^^^^^^^^^^
-
-Some algorithm arguments are relatively long, and we enabled shortcuts for them:
-
-.. option:: --hid, --ac_kwargs:hidden_sizes
-
- *list of ints*. Sets the sizes of the hidden layers in the neural networks (policies and value functions).
-
-.. option:: --act, --ac_kwargs:activation
-
- *tf op*. The activation function for the neural networks in the actor and critic.
-
-These flags are valid for all current Spinning Up algorithms.
-
-Config Flags
-^^^^^^^^^^^^
-
-These flags are not hyperparameters of any algorithm, but change the experimental configuration in some way.
-
-.. option:: --cpu, --num_cpu
-
- *int*. If this flag is set, the experiment is launched with this many processes, one per cpu, connected by MPI. Some algorithms are amenable to this sort of parallelization but not all. An error will be raised if you try setting ``num_cpu`` > 1 for an incompatible algorithm. You can also set ``--num_cpu auto``, which will automatically use as many CPUs as are available on the machine.
-
-.. option:: --exp_name
-
- *string*. The experiment name. This is used in naming the save directory for each experiment. The default is "cmd" + [algo name].
-
-.. option:: --data_dir
-
- *path*. Set the base save directory for this experiment or set of experiments. If none is given, the ``DEFAULT_DATA_DIR`` in ``spinup/user_config.py`` will be used.
-
-.. option:: --datestamp
-
- *bool*. Include date and time in the name for the save directory of the experiment.
-
-
-Where Results are Saved
------------------------
-
-Results for a particular experiment (a single run of a configuration of hyperparameters) are stored in
-
-::
-
- data_dir/[outer_prefix]exp_name[suffix]/[inner_prefix]exp_name[suffix]_s[seed]
-
-where
-
-* ``data_dir`` is the value of the ``--data_dir`` flag (defaults to ``DEFAULT_DATA_DIR`` from ``spinup/user_config.py`` if ``--data_dir`` is not given),
-* the ``outer_prefix`` is a ``YY-MM-DD_`` timestamp if the ``--datestamp`` flag is raised, otherwise nothing,
-* the ``inner_prefix`` is a ``YY-MM-DD_HH-MM-SS-`` timestamp if the ``--datestamp`` flag is raised, otherwise nothing,
-* and ``suffix`` is a special string based on the experiment hyperparameters.
-
-How is Suffix Determined?
-^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Suffixes are only included if you run multiple experiments at once, and they only include references to hyperparameters that differ across experiments, except for random seed. The goal is to make sure that results for similar experiments (ones which share all params except seed) are grouped in the same folder.
-
-Suffixes are constructed by combining *shorthands* for hyperparameters with their values, where a shorthand is either 1) constructed automatically from the hyperparameter name or 2) supplied by the user. The user can supply a shorthand by writing in square brackets after the kwarg flag.
-
-For example, consider:
-
-.. parsed-literal::
-
- python -m spinup.run ddpg --env Hopper-v2 --hid[h] [300] [128,128] --act tf.nn.tanh tf.nn.relu
-
-Here, the ``--hid`` flag is given a **user-supplied shorthand**, ``h``. The ``--act`` flag is not given a shorthand by the user, so one will be constructed for it automatically.
-
-The suffixes produced in this case are:
-
-.. parsed-literal::
- _h128-128_ac-actrelu
- _h128-128_ac-acttanh
- _h300_ac-actrelu
- _h300_ac-acttanh
-
-Note that the ``h`` was given by the user. the ``ac-act`` shorthand was constructed from ``ac_kwargs:activation`` (the true name for the ``act`` flag).
-
-
-Extra
------
-
-.. admonition:: You Don't Actually Need to Know This One
-
- Each individual algorithm is located in a file ``spinup/algos/ALGO_NAME/ALGO_NAME.py``, and these files can be run directly from the command line with a limited set of arguments (some of which differ from what's available to ``spinup/run.py``). The command line support in the individual algorithm files is essentially vestigial, however, and this is **not** a recommended way to perform experiments.
-
- This documentation page will not describe those command line calls, and will *only* describe calls through ``spinup/run.py``.
-
-Launching from Scripts
-======================
-
-Each algorithm is implemented as a python function, which can be imported directly from the ``spinup`` package, eg
-
->>> from spinup import ppo
-
-See the documentation page for each algorithm for a complete account of possible arguments. These methods can be used to set up specialized custom experiments, for example:
-
-.. code-block:: python
-
- from spinup import ppo
- import tensorflow as tf
- import gym
-
- env_fn = lambda : gym.make('LunarLander-v2')
-
- ac_kwargs = dict(hidden_sizes=[64,64], activation=tf.nn.relu)
-
- logger_kwargs = dict(output_dir='path/to/output_dir', exp_name='experiment_name')
-
- ppo(env_fn=env_fn, ac_kwargs=ac_kwargs, steps_per_epoch=5000, epochs=250, logger_kwargs=logger_kwargs)
-
-
-Using ExperimentGrid
---------------------
-
-It's often useful in machine learning research to run the same algorithm with many possible hyperparameters. Spinning Up ships with a simple tool for facilitating this, called `ExperimentGrid`_.
-
-
-Consider the example in ``spinup/examples/bench_ppo_cartpole.py``:
-
-.. code-block:: python
- :linenos:
-
- from spinup.utils.run_utils import ExperimentGrid
- from spinup import ppo
- import tensorflow as tf
-
- if __name__ == '__main__':
- import argparse
- parser = argparse.ArgumentParser()
- parser.add_argument('--cpu', type=int, default=4)
- parser.add_argument('--num_runs', type=int, default=3)
- args = parser.parse_args()
-
- eg = ExperimentGrid(name='ppo-bench')
- eg.add('env_name', 'CartPole-v0', '', True)
- eg.add('seed', [10*i for i in range(args.num_runs)])
- eg.add('epochs', 10)
- eg.add('steps_per_epoch', 4000)
- eg.add('ac_kwargs:hidden_sizes', [(32,), (64,64)], 'hid')
- eg.add('ac_kwargs:activation', [tf.tanh, tf.nn.relu], '')
- eg.run(ppo, num_cpu=args.cpu)
-
-After making the ExperimentGrid object, parameters are added to it with
-
-.. parsed-literal::
-
- eg.add(param_name, values, shorthand, in_name)
-
-where ``in_name`` forces a parameter to appear in the experiment name, even if it has the same value across all experiments.
-
-After all parameters have been added,
-
-.. parsed-literal::
-
- eg.run(thunk, **run_kwargs)
-
-runs all experiments in the grid (one experiment per valid configuration), by providing the configurations as kwargs to the function ``thunk``. ``ExperimentGrid.run`` uses a function named `call_experiment`_ to launch ``thunk``, and ``**run_kwargs`` specify behaviors for ``call_experiment``. See `the documentation page`_ for details.
-
-Except for the absence of shortcut kwargs (you can't use ``hid`` for ``ac_kwargs:hidden_sizes`` in ``ExperimentGrid``), the basic behavior of ``ExperimentGrid`` is the same as running things from the command line. (In fact, ``spinup.run`` uses an ``ExperimentGrid`` under the hood.)
-
-.. _`ExperimentGrid`: ../utils/run_utils.html#experimentgrid
-.. _`the documentation page`: ../utils/run_utils.html#experimentgrid
-.. _`call_experiment`: ../utils/run_utils.html#spinup.utils.run_utils.call_experiment
\ No newline at end of file
diff --git a/docs/_build/html/_sources/user/saving_and_loading.rst.txt b/docs/_build/html/_sources/user/saving_and_loading.rst.txt
deleted file mode 100644
index 3397a2a21..000000000
--- a/docs/_build/html/_sources/user/saving_and_loading.rst.txt
+++ /dev/null
@@ -1,190 +0,0 @@
-==================
-Experiment Outputs
-==================
-
-.. contents:: Table of Contents
-
-In this section we'll cover
-
-- what outputs come from Spinning Up algorithm implementations,
-- what formats they're stored in and how they're organized,
-- where they are stored and how you can change that,
-- and how to load and run trained policies.
-
-.. admonition:: You Should Know
-
- Spinning Up implementations currently have no way to resume training for partially-trained agents. If you consider this feature important, please let us know---or consider it a hacking project!
-
-Algorithm Outputs
-=================
-
-Each algorithm is set up to save a training run's hyperparameter configuration, learning progress, trained agent and value functions, and a copy of the environment if possible (to make it easy to load up the agent and environment simultaneously). The output directory contains the following:
-
-+--------------------------------------------------------------------------------+
-| **Output Directory Structure** |
-+----------------+---------------------------------------------------------------+
-|``simple_save/``| | A directory containing everything needed to restore the |
-| | | trained agent and value functions. (`Details below.`_) |
-+----------------+---------------------------------------------------------------+
-|``config.json`` | | A dict containing an as-complete-as-possible description |
-| | | of the args and kwargs you used to launch the training |
-| | | function. If you passed in something which can't be |
-| | | serialized to JSON, it should get handled gracefully by the |
-| | | logger, and the config file will represent it with a string.|
-| | | Note: this is meant for record-keeping only. Launching an |
-| | | experiment from a config file is not currently supported. |
-+----------------+---------------------------------------------------------------+
-|``progress.txt``| | A tab-separated value file containing records of the metrics|
-| | | recorded by the logger throughout training. eg, ``Epoch``, |
-| | | ``AverageEpRet``, etc. |
-+----------------+---------------------------------------------------------------+
-|``vars.pkl`` | | A pickle file containing anything about the algorithm state |
-| | | which should get stored. Currently, all algorithms only use |
-| | | this to save a copy of the environment. |
-+----------------+---------------------------------------------------------------+
-
-.. admonition:: You Should Know
-
- Sometimes environment-saving fails because the environment can't be pickled, and ``vars.pkl`` is empty. This is known to be a problem for Gym Box2D environments in older versions of Gym, which can't be saved in this manner.
-
-.. _`Details below.`:
-
-The ``simple_save`` directory contains:
-
-+----------------------------------------------------------------------------------+
-| **Simple_Save Directory Structure** |
-+------------------+---------------------------------------------------------------+
-|``variables/`` | | A directory containing outputs from the Tensorflow Saver. |
-| | | See documentation for `Tensorflow SavedModel`_. |
-+------------------+---------------------------------------------------------------+
-|``model_info.pkl``| | A dict containing information (map from key to tensor name) |
-| | | which helps us unpack the saved model after loading. |
-+------------------+---------------------------------------------------------------+
-|``saved_model.pb``| | A protocol buffer, needed for a `Tensorflow SavedModel`_. |
-+------------------+---------------------------------------------------------------+
-
-.. admonition:: You Should Know
-
- The only file in here that you should ever have to use "by hand" is the ``config.json`` file. Our agent testing utility will load things from the ``simple_save/`` directory and ``vars.pkl`` file, and our plotter interprets the contents of ``progress.txt``, and those are the correct tools for interfacing with these outputs. But there is no tooling for ``config.json``---it's just there so that if you forget what hyperparameters you ran an experiment with, you can double-check.
-
-
-.. _`Tensorflow SavedModel`: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md
-
-
-Save Directory Location
-=======================
-
-Experiment results will, by default, be saved in the same directory as the Spinning Up package, in a folder called ``data``:
-
-.. parsed-literal::
-
- spinningup/
- **data/**
- ...
- docs/
- ...
- spinup/
- ...
- LICENSE
- setup.py
-
-You can change the default results directory by modifying ``DEFAULT_DATA_DIR`` in ``spinup/user_config.py``.
-
-
-Loading and Running Trained Policies
-====================================
-
-
-If Environment Saves Successfully
----------------------------------
-
-For cases where the environment is successfully saved alongside the agent, it's a cinch to watch the trained agent act in the environment using:
-
-
-.. parsed-literal::
-
- python -m spinup.run test_policy path/to/output_directory
-
-
-There are a few flags for options:
-
-
-.. option:: -l L, --len=L, default=0
-
- *int*. Maximum length of test episode / trajectory / rollout. The default of 0 means no maximum episode length---episodes only end when the agent has reached a terminal state in the environment. (Note: setting L=0 will not prevent Gym envs wrapped by TimeLimit wrappers from ending when they reach their pre-set maximum episode length.)
-
-.. option:: -n N, --episodes=N, default=100
-
- *int*. Number of test episodes to run the agent for.
-
-.. option:: -nr, --norender
-
- Do not render the test episodes to the screen. In this case, ``test_policy`` will only print the episode returns and lengths. (Use case: the renderer slows down the testing process, and you just want to get a fast sense of how the agent is performing, so you don't particularly care to watch it.)
-
-.. option:: -i I, --itr=I, default=-1
-
- *int*. This is an option for a special case which is not supported by algorithms in this package as-shipped, but which they are easily modified to do. Use case: Sometimes it's nice to watch trained agents from many different points in training (eg watch at iteration 50, 100, 150, etc.). The logger can do this---save snapshots of the agent from those different points, so they can be run and watched later. In this case, you use this flag to specify which iteration to run. But again: spinup algorithms by default only save snapshots of the most recent agent, overwriting the old snapshots.
-
- The default value of this flag means "use the latest snapshot."
-
- To modify an algo so it does produce multiple snapshots, find the following lines (which are present in all of the algorithms):
-
- .. code-block:: python
-
- if (epoch % save_freq == 0) or (epoch == epochs-1):
- logger.save_state({'env': env}, None)
-
- and tweak them to
-
- .. code-block:: python
-
- if (epoch % save_freq == 0) or (epoch == epochs-1):
- logger.save_state({'env': env}, epoch)
-
- Make sure to then also set ``save_freq`` to something reasonable (because if it defaults to 1, for instance, you'll flood your output directory with one ``simple_save`` folder for each snapshot---which adds up fast).
-
-
-.. option:: -d, --deterministic
-
- Another special case, which is only used for SAC. The Spinning Up SAC implementation trains a stochastic policy, but is evaluated using the deterministic *mean* of the action distribution. ``test_policy`` will default to using the stochastic policy trained by SAC, but you should set the deterministic flag to watch the deterministic mean policy (the correct evaluation policy for SAC). This flag is not used for any other algorithms.
-
-
-
-Environment Not Found Error
----------------------------
-
-If the environment wasn't saved successfully, you can expect ``test_policy.py`` to crash with
-
-.. parsed-literal::
-
- Traceback (most recent call last):
- File "spinup/utils/test_policy.py", line 88, in
- run_policy(env, get_action, args.len, args.episodes, not(args.norender))
- File "spinup/utils/test_policy.py", line 50, in run_policy
- "page on Experiment Outputs for how to handle this situation."
- AssertionError: Environment not found!
-
- It looks like the environment wasn't saved, and we can't run the agent in it. :(
-
- Check out the readthedocs page on Experiment Outputs for how to handle this situation.
-
-
-In this case, watching your agent perform is slightly more of a pain but not impossible, as long as you can recreate your environment easily. Try the following in IPython:
-
->>> from spinup.utils.test_policy import load_policy, run_policy
->>> import your_env
->>> _, get_action = load_policy('/path/to/output_directory')
->>> env = your_env.make()
->>> run_policy(env, get_action)
-Logging data to /tmp/experiments/1536150702/progress.txt
-Episode 0 EpRet -163.830 EpLen 93
-Episode 1 EpRet -346.164 EpLen 99
-...
-
-
-Using Trained Value Functions
------------------------------
-
-The ``test_policy.py`` tool doesn't help you look at trained value functions, and if you want to use those, you will have to do some digging by hand. Check the documentation for the `restore_tf_graph`_ function for details on how.
-
-.. _`restore_tf_graph`: ../utils/logger.html#spinup.utils.logx.restore_tf_graph
\ No newline at end of file
diff --git a/docs/_build/html/_sources/utils/logger.rst.txt b/docs/_build/html/_sources/utils/logger.rst.txt
deleted file mode 100644
index 2c338bc4f..000000000
--- a/docs/_build/html/_sources/utils/logger.rst.txt
+++ /dev/null
@@ -1,169 +0,0 @@
-======
-Logger
-======
-
-.. contents:: Table of Contents
-
-Using a Logger
-==============
-
-Spinning Up ships with basic logging tools, implemented in the classes `Logger`_ and `EpochLogger`_. The Logger class contains most of the basic functionality for saving diagnostics, hyperparameter configurations, the state of a training run, and the trained model. The EpochLogger class adds a thin layer on top of that to make it easy to track the average, standard deviation, min, and max value of a diagnostic over each epoch and across MPI workers.
-
-.. admonition:: You Should Know
-
- All Spinning Up algorithm implementations use an EpochLogger.
-
-.. _`Logger`: ../utils/logger.html#spinup.utils.logx.Logger
-.. _`EpochLogger`: ../utils/logger.html#spinup.utils.logx.EpochLogger
-
-Examples
---------
-
-First, let's look at a simple example of how an EpochLogger keeps track of a diagnostic value:
-
->>> from spinup.utils.logx import EpochLogger
->>> epoch_logger = EpochLogger()
->>> for i in range(10):
- epoch_logger.store(Test=i)
->>> epoch_logger.log_tabular('Test', with_min_and_max=True)
->>> epoch_logger.dump_tabular()
--------------------------------------
-| AverageTest | 4.5 |
-| StdTest | 2.87 |
-| MaxTest | 9 |
-| MinTest | 0 |
--------------------------------------
-
-The ``store`` method is used to save all values of ``Test`` to the ``epoch_logger``'s internal state. Then, when ``log_tabular`` is called, it computes the average, standard deviation, min, and max of ``Test`` over all of the values in the internal state. The internal state is wiped clean after the call to ``log_tabular`` (to prevent leakage into the statistics at the next epoch). Finally, ``dump_tabular`` is called to write the diagnostics to file and to stdout.
-
-Next, let's look at a full training procedure with the logger embedded, to highlight configuration and model saving as well as diagnostic logging:
-
-.. code-block:: python
- :linenos:
- :emphasize-lines: 18, 19, 42, 43, 54, 58, 61, 62, 63, 64, 65, 66
-
- import numpy as np
- import tensorflow as tf
- import time
- from spinup.utils.logx import EpochLogger
-
-
- def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None):
- for h in hidden_sizes[:-1]:
- x = tf.layers.dense(x, units=h, activation=activation)
- return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation)
-
-
- # Simple script for training an MLP on MNIST.
- def train_mnist(steps_per_epoch=100, epochs=5,
- lr=1e-3, layers=2, hidden_size=64,
- logger_kwargs=dict(), save_freq=1):
-
- logger = EpochLogger(**logger_kwargs)
- logger.save_config(locals())
-
- # Load and preprocess MNIST data
- (x_train, y_train), _ = tf.keras.datasets.mnist.load_data()
- x_train = x_train.reshape(-1, 28*28) / 255.0
-
- # Define inputs & main outputs from computation graph
- x_ph = tf.placeholder(tf.float32, shape=(None, 28*28))
- y_ph = tf.placeholder(tf.int32, shape=(None,))
- logits = mlp(x_ph, hidden_sizes=[hidden_size]*layers + [10], activation=tf.nn.relu)
- predict = tf.argmax(logits, axis=1, output_type=tf.int32)
-
- # Define loss function, accuracy, and training op
- y = tf.one_hot(y_ph, 10)
- loss = tf.losses.softmax_cross_entropy(y, logits)
- acc = tf.reduce_mean(tf.cast(tf.equal(y_ph, predict), tf.float32))
- train_op = tf.train.AdamOptimizer().minimize(loss)
-
- # Prepare session
- sess = tf.Session()
- sess.run(tf.global_variables_initializer())
-
- # Setup model saving
- logger.setup_tf_saver(sess, inputs={'x': x_ph},
- outputs={'logits': logits, 'predict': predict})
-
- start_time = time.time()
-
- # Run main training loop
- for epoch in range(epochs):
- for t in range(steps_per_epoch):
- idxs = np.random.randint(0, len(x_train), 32)
- feed_dict = {x_ph: x_train[idxs],
- y_ph: y_train[idxs]}
- outs = sess.run([loss, acc, train_op], feed_dict=feed_dict)
- logger.store(Loss=outs[0], Acc=outs[1])
-
- # Save model
- if (epoch % save_freq == 0) or (epoch == epochs-1):
- logger.save_state(state_dict=dict(), itr=None)
-
- # Log info about epoch
- logger.log_tabular('Epoch', epoch)
- logger.log_tabular('Acc', with_min_and_max=True)
- logger.log_tabular('Loss', average_only=True)
- logger.log_tabular('TotalGradientSteps', (epoch+1)*steps_per_epoch)
- logger.log_tabular('Time', time.time()-start_time)
- logger.dump_tabular()
-
- if __name__ == '__main__':
- train_mnist()
-
-In this example, observe that
-
-* On line 19, `logger.save_config`_ is used to save the hyperparameter configuration to a JSON file.
-* On lines 42 and 43, `logger.setup_tf_saver`_ is used to prepare the logger to save the key elements of the computation graph.
-* On line 54, diagnostics are saved to the logger's internal state via `logger.store`_.
-* On line 58, the computation graph is saved once per epoch via `logger.save_state`_.
-* On lines 61-66, `logger.log_tabular`_ and `logger.dump_tabular`_ are used to write the epoch diagnostics to file. Note that the keys passed into `logger.log_tabular`_ are the same as the keys passed into `logger.store`_.
-
-.. _`logger.save_config`: ../utils/logger.html#spinup.utils.logx.Logger.save_config
-.. _`logger.setup_tf_saver`: ../utils/logger.html#spinup.utils.logx.Logger.setup_tf_saver
-.. _`logger.store`: ../utils/logger.html#spinup.utils.logx.EpochLogger.store
-.. _`logger.save_state`: ../utils/logger.html#spinup.utils.logx.Logger.save_state
-.. _`logger.log_tabular`: ../utils/logger.html#spinup.utils.logx.EpochLogger.log_tabular
-.. _`logger.dump_tabular`: ../utils/logger.html#spinup.utils.logx.Logger.dump_tabular
-
-
-Logging and MPI
----------------
-
-.. admonition:: You Should Know
-
- Several algorithms in RL are easily parallelized by using MPI to average gradients and/or other key quantities. The Spinning Up loggers are designed to be well-behaved when using MPI: things will only get written to stdout and to file from the process with rank 0. But information from other processes isn't lost if you use the EpochLogger: everything which is passed into EpochLogger via ``store``, regardless of which process it's stored in, gets used to compute average/std/min/max values for a diagnostic.
-
-
-Logger Classes
-==============
-
-
-.. autoclass:: spinup.utils.logx.Logger
- :members:
-
- .. automethod:: spinup.utils.logx.Logger.__init__
-
-.. autoclass:: spinup.utils.logx.EpochLogger
- :show-inheritance:
- :members:
-
-
-
-Loading Saved Graphs
-====================
-
-.. autofunction:: spinup.utils.logx.restore_tf_graph
-
-When you use this method to restore a graph saved by a Spinning Up implementation, you can minimally expect it to include the following:
-
-====== ===============================================
-Key Value
-====== ===============================================
-``x`` Tensorflow placeholder for state input.
-``pi`` | Samples an action from the agent, conditioned
- | on states in ``x``.
-====== ===============================================
-
-The relevant value functions for an algorithm are also typically stored. For details of what else gets saved by a given algorithm, see its documentation page.
\ No newline at end of file
diff --git a/docs/_build/html/_sources/utils/mpi.rst.txt b/docs/_build/html/_sources/utils/mpi.rst.txt
deleted file mode 100644
index 4a1a5b424..000000000
--- a/docs/_build/html/_sources/utils/mpi.rst.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-=========
-MPI Tools
-=========
-
-.. contents:: Table of Contents
-
-Core MPI Utilities
-==================
-
-.. automodule:: spinup.utils.mpi_tools
- :members:
-
-
-MPI + Tensorflow Utilities
-==========================
-
-The ``spinup.utils.mpi_tf`` contains a a few tools to make it easy to use the AdamOptimizer across many MPI processes. This is a bit hacky---if you're looking for something more sophisticated and general-purpose, consider `horovod`_.
-
-.. _`horovod`: https://github.com/uber/horovod
-
-.. automodule:: spinup.utils.mpi_tf
- :members:
\ No newline at end of file
diff --git a/docs/_build/html/_sources/utils/plotter.rst.txt b/docs/_build/html/_sources/utils/plotter.rst.txt
deleted file mode 100644
index b6e3af271..000000000
--- a/docs/_build/html/_sources/utils/plotter.rst.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-=======
-Plotter
-=======
-
-See the page on `plotting results`_ for documentation of the plotter.
-
-.. _`plotting results`: ../user/plotting.html
\ No newline at end of file
diff --git a/docs/_build/html/_sources/utils/run_utils.rst.txt b/docs/_build/html/_sources/utils/run_utils.rst.txt
deleted file mode 100644
index 4345fe07f..000000000
--- a/docs/_build/html/_sources/utils/run_utils.rst.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-=========
-Run Utils
-=========
-
-.. contents:: Table of Contents
-
-ExperimentGrid
-==============
-
-Spinning Up ships with a tool called ExperimentGrid for making hyperparameter ablations easier. This is based on (but simpler than) `the rllab tool`_ called VariantGenerator.
-
-.. _`the rllab tool`: https://github.com/rll/rllab/blob/master/rllab/misc/instrument.py#L173
-
-.. autoclass:: spinup.utils.run_utils.ExperimentGrid
- :members:
-
-
-Calling Experiments
-===================
-
-.. autofunction:: spinup.utils.run_utils.call_experiment
-
-.. autofunction:: spinup.utils.run_utils.setup_logger_kwargs
diff --git a/docs/_build/html/_static/ajax-loader.gif b/docs/_build/html/_static/ajax-loader.gif
deleted file mode 100644
index 61faf8cab..000000000
Binary files a/docs/_build/html/_static/ajax-loader.gif and /dev/null differ
diff --git a/docs/_build/html/_static/basic.css b/docs/_build/html/_static/basic.css
deleted file mode 100644
index dc88b5a2d..000000000
--- a/docs/_build/html/_static/basic.css
+++ /dev/null
@@ -1,632 +0,0 @@
-/*
- * basic.css
- * ~~~~~~~~~
- *
- * Sphinx stylesheet -- basic theme.
- *
- * :copyright: Copyright 2007-2017 by the Sphinx team, see AUTHORS.
- * :license: BSD, see LICENSE for details.
- *
- */
-
-/* -- main layout ----------------------------------------------------------- */
-
-div.clearer {
- clear: both;
-}
-
-/* -- relbar ---------------------------------------------------------------- */
-
-div.related {
- width: 100%;
- font-size: 90%;
-}
-
-div.related h3 {
- display: none;
-}
-
-div.related ul {
- margin: 0;
- padding: 0 0 0 10px;
- list-style: none;
-}
-
-div.related li {
- display: inline;
-}
-
-div.related li.right {
- float: right;
- margin-right: 5px;
-}
-
-/* -- sidebar --------------------------------------------------------------- */
-
-div.sphinxsidebarwrapper {
- padding: 10px 5px 0 10px;
-}
-
-div.sphinxsidebar {
- float: left;
- width: 230px;
- margin-left: -100%;
- font-size: 90%;
- word-wrap: break-word;
- overflow-wrap : break-word;
-}
-
-div.sphinxsidebar ul {
- list-style: none;
-}
-
-div.sphinxsidebar ul ul,
-div.sphinxsidebar ul.want-points {
- margin-left: 20px;
- list-style: square;
-}
-
-div.sphinxsidebar ul ul {
- margin-top: 0;
- margin-bottom: 0;
-}
-
-div.sphinxsidebar form {
- margin-top: 10px;
-}
-
-div.sphinxsidebar input {
- border: 1px solid #98dbcc;
- font-family: sans-serif;
- font-size: 1em;
-}
-
-div.sphinxsidebar #searchbox input[type="text"] {
- width: 170px;
-}
-
-img {
- border: 0;
- max-width: 100%;
-}
-
-/* -- search page ----------------------------------------------------------- */
-
-ul.search {
- margin: 10px 0 0 20px;
- padding: 0;
-}
-
-ul.search li {
- padding: 5px 0 5px 20px;
- background-image: url(file.png);
- background-repeat: no-repeat;
- background-position: 0 7px;
-}
-
-ul.search li a {
- font-weight: bold;
-}
-
-ul.search li div.context {
- color: #888;
- margin: 2px 0 0 30px;
- text-align: left;
-}
-
-ul.keywordmatches li.goodmatch a {
- font-weight: bold;
-}
-
-/* -- index page ------------------------------------------------------------ */
-
-table.contentstable {
- width: 90%;
- margin-left: auto;
- margin-right: auto;
-}
-
-table.contentstable p.biglink {
- line-height: 150%;
-}
-
-a.biglink {
- font-size: 1.3em;
-}
-
-span.linkdescr {
- font-style: italic;
- padding-top: 5px;
- font-size: 90%;
-}
-
-/* -- general index --------------------------------------------------------- */
-
-table.indextable {
- width: 100%;
-}
-
-table.indextable td {
- text-align: left;
- vertical-align: top;
-}
-
-table.indextable ul {
- margin-top: 0;
- margin-bottom: 0;
- list-style-type: none;
-}
-
-table.indextable > tbody > tr > td > ul {
- padding-left: 0em;
-}
-
-table.indextable tr.pcap {
- height: 10px;
-}
-
-table.indextable tr.cap {
- margin-top: 10px;
- background-color: #f2f2f2;
-}
-
-img.toggler {
- margin-right: 3px;
- margin-top: 3px;
- cursor: pointer;
-}
-
-div.modindex-jumpbox {
- border-top: 1px solid #ddd;
- border-bottom: 1px solid #ddd;
- margin: 1em 0 1em 0;
- padding: 0.4em;
-}
-
-div.genindex-jumpbox {
- border-top: 1px solid #ddd;
- border-bottom: 1px solid #ddd;
- margin: 1em 0 1em 0;
- padding: 0.4em;
-}
-
-/* -- domain module index --------------------------------------------------- */
-
-table.modindextable td {
- padding: 2px;
- border-collapse: collapse;
-}
-
-/* -- general body styles --------------------------------------------------- */
-
-div.body p, div.body dd, div.body li, div.body blockquote {
- -moz-hyphens: auto;
- -ms-hyphens: auto;
- -webkit-hyphens: auto;
- hyphens: auto;
-}
-
-a.headerlink {
- visibility: hidden;
-}
-
-h1:hover > a.headerlink,
-h2:hover > a.headerlink,
-h3:hover > a.headerlink,
-h4:hover > a.headerlink,
-h5:hover > a.headerlink,
-h6:hover > a.headerlink,
-dt:hover > a.headerlink,
-caption:hover > a.headerlink,
-p.caption:hover > a.headerlink,
-div.code-block-caption:hover > a.headerlink {
- visibility: visible;
-}
-
-div.body p.caption {
- text-align: inherit;
-}
-
-div.body td {
- text-align: left;
-}
-
-.first {
- margin-top: 0 !important;
-}
-
-p.rubric {
- margin-top: 30px;
- font-weight: bold;
-}
-
-img.align-left, .figure.align-left, object.align-left {
- clear: left;
- float: left;
- margin-right: 1em;
-}
-
-img.align-right, .figure.align-right, object.align-right {
- clear: right;
- float: right;
- margin-left: 1em;
-}
-
-img.align-center, .figure.align-center, object.align-center {
- display: block;
- margin-left: auto;
- margin-right: auto;
-}
-
-.align-left {
- text-align: left;
-}
-
-.align-center {
- text-align: center;
-}
-
-.align-right {
- text-align: right;
-}
-
-/* -- sidebars -------------------------------------------------------------- */
-
-div.sidebar {
- margin: 0 0 0.5em 1em;
- border: 1px solid #ddb;
- padding: 7px 7px 0 7px;
- background-color: #ffe;
- width: 40%;
- float: right;
-}
-
-p.sidebar-title {
- font-weight: bold;
-}
-
-/* -- topics ---------------------------------------------------------------- */
-
-div.topic {
- border: 1px solid #ccc;
- padding: 7px 7px 0 7px;
- margin: 10px 0 10px 0;
-}
-
-p.topic-title {
- font-size: 1.1em;
- font-weight: bold;
- margin-top: 10px;
-}
-
-/* -- admonitions ----------------------------------------------------------- */
-
-div.admonition {
- margin-top: 10px;
- margin-bottom: 10px;
- padding: 7px;
-}
-
-div.admonition dt {
- font-weight: bold;
-}
-
-div.admonition dl {
- margin-bottom: 0;
-}
-
-p.admonition-title {
- margin: 0px 10px 5px 0px;
- font-weight: bold;
-}
-
-div.body p.centered {
- text-align: center;
- margin-top: 25px;
-}
-
-/* -- tables ---------------------------------------------------------------- */
-
-table.docutils {
- border: 0;
- border-collapse: collapse;
-}
-
-table caption span.caption-number {
- font-style: italic;
-}
-
-table caption span.caption-text {
-}
-
-table.docutils td, table.docutils th {
- padding: 1px 8px 1px 5px;
- border-top: 0;
- border-left: 0;
- border-right: 0;
- border-bottom: 1px solid #aaa;
-}
-
-table.footnote td, table.footnote th {
- border: 0 !important;
-}
-
-th {
- text-align: left;
- padding-right: 5px;
-}
-
-table.citation {
- border-left: solid 1px gray;
- margin-left: 1px;
-}
-
-table.citation td {
- border-bottom: none;
-}
-
-/* -- figures --------------------------------------------------------------- */
-
-div.figure {
- margin: 0.5em;
- padding: 0.5em;
-}
-
-div.figure p.caption {
- padding: 0.3em;
-}
-
-div.figure p.caption span.caption-number {
- font-style: italic;
-}
-
-div.figure p.caption span.caption-text {
-}
-
-/* -- field list styles ----------------------------------------------------- */
-
-table.field-list td, table.field-list th {
- border: 0 !important;
-}
-
-.field-list ul {
- margin: 0;
- padding-left: 1em;
-}
-
-.field-list p {
- margin: 0;
-}
-
-/* -- other body styles ----------------------------------------------------- */
-
-ol.arabic {
- list-style: decimal;
-}
-
-ol.loweralpha {
- list-style: lower-alpha;
-}
-
-ol.upperalpha {
- list-style: upper-alpha;
-}
-
-ol.lowerroman {
- list-style: lower-roman;
-}
-
-ol.upperroman {
- list-style: upper-roman;
-}
-
-dl {
- margin-bottom: 15px;
-}
-
-dd p {
- margin-top: 0px;
-}
-
-dd ul, dd table {
- margin-bottom: 10px;
-}
-
-dd {
- margin-top: 3px;
- margin-bottom: 10px;
- margin-left: 30px;
-}
-
-dt:target, .highlighted {
- background-color: #fbe54e;
-}
-
-dl.glossary dt {
- font-weight: bold;
- font-size: 1.1em;
-}
-
-.optional {
- font-size: 1.3em;
-}
-
-.sig-paren {
- font-size: larger;
-}
-
-.versionmodified {
- font-style: italic;
-}
-
-.system-message {
- background-color: #fda;
- padding: 5px;
- border: 3px solid red;
-}
-
-.footnote:target {
- background-color: #ffa;
-}
-
-.line-block {
- display: block;
- margin-top: 1em;
- margin-bottom: 1em;
-}
-
-.line-block .line-block {
- margin-top: 0;
- margin-bottom: 0;
- margin-left: 1.5em;
-}
-
-.guilabel, .menuselection {
- font-family: sans-serif;
-}
-
-.accelerator {
- text-decoration: underline;
-}
-
-.classifier {
- font-style: oblique;
-}
-
-abbr, acronym {
- border-bottom: dotted 1px;
- cursor: help;
-}
-
-/* -- code displays --------------------------------------------------------- */
-
-pre {
- overflow: auto;
- overflow-y: hidden; /* fixes display issues on Chrome browsers */
-}
-
-span.pre {
- -moz-hyphens: none;
- -ms-hyphens: none;
- -webkit-hyphens: none;
- hyphens: none;
-}
-
-td.linenos pre {
- padding: 5px 0px;
- border: 0;
- background-color: transparent;
- color: #aaa;
-}
-
-table.highlighttable {
- margin-left: 0.5em;
-}
-
-table.highlighttable td {
- padding: 0 0.5em 0 0.5em;
-}
-
-div.code-block-caption {
- padding: 2px 5px;
- font-size: small;
-}
-
-div.code-block-caption code {
- background-color: transparent;
-}
-
-div.code-block-caption + div > div.highlight > pre {
- margin-top: 0;
-}
-
-div.code-block-caption span.caption-number {
- padding: 0.1em 0.3em;
- font-style: italic;
-}
-
-div.code-block-caption span.caption-text {
-}
-
-div.literal-block-wrapper {
- padding: 1em 1em 0;
-}
-
-div.literal-block-wrapper div.highlight {
- margin: 0;
-}
-
-code.descname {
- background-color: transparent;
- font-weight: bold;
- font-size: 1.2em;
-}
-
-code.descclassname {
- background-color: transparent;
-}
-
-code.xref, a code {
- background-color: transparent;
- font-weight: bold;
-}
-
-h1 code, h2 code, h3 code, h4 code, h5 code, h6 code {
- background-color: transparent;
-}
-
-.viewcode-link {
- float: right;
-}
-
-.viewcode-back {
- float: right;
- font-family: sans-serif;
-}
-
-div.viewcode-block:target {
- margin: -1px -10px;
- padding: 0 10px;
-}
-
-/* -- math display ---------------------------------------------------------- */
-
-img.math {
- vertical-align: middle;
-}
-
-div.body div.math p {
- text-align: center;
-}
-
-span.eqno {
- float: right;
-}
-
-span.eqno a.headerlink {
- position: relative;
- left: 0px;
- z-index: 1;
-}
-
-div.math:hover a.headerlink {
- visibility: visible;
-}
-
-/* -- printout stylesheet --------------------------------------------------- */
-
-@media print {
- div.document,
- div.documentwrapper,
- div.bodywrapper {
- margin: 0 !important;
- width: 100%;
- }
-
- div.sphinxsidebar,
- div.related,
- div.footer,
- #top-link {
- display: none;
- }
-}
\ No newline at end of file
diff --git a/docs/_build/html/_static/comment-bright.png b/docs/_build/html/_static/comment-bright.png
deleted file mode 100644
index 15e27edb1..000000000
Binary files a/docs/_build/html/_static/comment-bright.png and /dev/null differ
diff --git a/docs/_build/html/_static/comment-close.png b/docs/_build/html/_static/comment-close.png
deleted file mode 100644
index 4d91bcf57..000000000
Binary files a/docs/_build/html/_static/comment-close.png and /dev/null differ
diff --git a/docs/_build/html/_static/comment.png b/docs/_build/html/_static/comment.png
deleted file mode 100644
index dfbc0cbd5..000000000
Binary files a/docs/_build/html/_static/comment.png and /dev/null differ
diff --git a/docs/_build/html/_static/css/badge_only.css b/docs/_build/html/_static/css/badge_only.css
deleted file mode 100644
index 323730ae2..000000000
--- a/docs/_build/html/_static/css/badge_only.css
+++ /dev/null
@@ -1 +0,0 @@
-.fa:before{-webkit-font-smoothing:antialiased}.clearfix{*zoom:1}.clearfix:before,.clearfix:after{display:table;content:""}.clearfix:after{clear:both}@font-face{font-family:FontAwesome;font-weight:normal;font-style:normal;src:url("../fonts/fontawesome-webfont.eot");src:url("../fonts/fontawesome-webfont.eot?#iefix") format("embedded-opentype"),url("../fonts/fontawesome-webfont.woff") format("woff"),url("../fonts/fontawesome-webfont.ttf") format("truetype"),url("../fonts/fontawesome-webfont.svg#FontAwesome") format("svg")}.fa:before{display:inline-block;font-family:FontAwesome;font-style:normal;font-weight:normal;line-height:1;text-decoration:inherit}a .fa{display:inline-block;text-decoration:inherit}li .fa{display:inline-block}li .fa-large:before,li .fa-large:before{width:1.875em}ul.fas{list-style-type:none;margin-left:2em;text-indent:-0.8em}ul.fas li .fa{width:.8em}ul.fas li .fa-large:before,ul.fas li .fa-large:before{vertical-align:baseline}.fa-book:before{content:""}.icon-book:before{content:""}.fa-caret-down:before{content:""}.icon-caret-down:before{content:""}.fa-caret-up:before{content:""}.icon-caret-up:before{content:""}.fa-caret-left:before{content:""}.icon-caret-left:before{content:""}.fa-caret-right:before{content:""}.icon-caret-right:before{content:""}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;font-family:"Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;z-index:400}.rst-versions a{color:#2980B9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27AE60;*zoom:1}.rst-versions .rst-current-version:before,.rst-versions .rst-current-version:after{display:table;content:""}.rst-versions .rst-current-version:after{clear:both}.rst-versions .rst-current-version .fa{color:#fcfcfc}.rst-versions .rst-current-version .fa-book{float:left}.rst-versions 
.rst-current-version .icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#E74C3C;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#F1C40F;color:#000}.rst-versions.shift-up{height:auto;max-height:100%}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:gray;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:solid 1px #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px}.rst-versions.rst-badge .icon-book{float:none}.rst-versions.rst-badge .fa-book{float:none}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version .fa-book{float:left}.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge .rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and (max-width: 768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}}
diff --git a/docs/_build/html/_static/css/modify.css b/docs/_build/html/_static/css/modify.css
deleted file mode 100644
index 1b2bc293c..000000000
--- a/docs/_build/html/_static/css/modify.css
+++ /dev/null
@@ -1,166 +0,0 @@
-:root {
- /* Colors */
- --color--white: #fff;
- --color--lightwash: #f7fbfb;
- --color--mediumwash: #eff7f8;
- --color--darkwash: #e6f3f3;
- --color--warmgraylight: #eeedee;
- --color--warmgraydark: #a3acb0;
- --color--coolgray1: #c5c5d2;
- --color--coolgray2: #8e8ea0;
- --color--coolgray3: #6e6e80;
- --color--coolgray4: #404452;
- --color--black: #050505;
- --color--pink: #e6a2e4;
- --color--magenta: #dd5ce5;
- --color--red: #bd1c5f;
- --color--brightred: #ef4146;
- --color--orange: #e86c09;
- --color--golden: #f4ac36;
- --color--yellow: #ebe93d;
- --color--lightgreen: #68de7a;
- --color--darkgreen: #10a37f;
- --color--teal: #2ff3ce;
- --color--lightblue: #27b5ea;
- --color--mediumblue: #2e95d3;
- --color--darkblue: #5436da;
- --color--navyblue: #1d0d4c;
- --color--lightpurple: #6b40d8;
- --color--darkpurple: #412991;
- --color--lightgrayishpurple: #cdc3cf;
- --color--mediumgrayishpurple: #9c88a3;
- --color--darkgrayishpurple: #562f5f;
-}
-
-body {
- color: var(--color--darkgray) !important;
- fill: var(--color--darkgray) !important;
-}
-
-h1, h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend {
- /* font-weight: 500;
- font-family: Colfax, sans-serif !important; */
- font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif !important;
-}
-
-.wy-nav-top {
- background-color: var(--color--coolgray4) !important;
-}
-
-.rst-content .toc-backref {
- color: #404040 !important;
-}
-
-.footnote {
- padding-left: 0.75rem;
- background-color: var(--color--warmgraylight) !important;
-}
-
-.wy-nav-top a, .wy-nav-top a:visited {
- color: var(--color--white) !important;
-}
-
-.wy-menu-vertical header, .wy-menu-vertical p.caption {
- font-weight: 500 !important;
- letter-spacing: 1px;
- margin-top: 1.25rem;
-}
-
-.wy-side-nav-search {
- background-color: var(--color--warmgraylight) !important;
-}
-
-.wy-body-for-nav {
- background-color: var(--color--coolgray1) !important;
-}
-
-.wy-menu-vertical li span.toctree-expand {
- color: var(--color--coolgray2) !important;
-}
-
-.wy-nav-side {
- color: var(--color--coolgray1) !important;
- background-color: var(--color--coolgray4) !important;
-}
-
-.wy-side-nav-search input[type=text] {
- border-color: var(--color--warmgraydark) !important;
-}
-
-a {
- color: var(--color--mediumblue) !important;
-}
-
-a:visited {
- color: #9B59B6 !important;
-}
-
-.wy-menu-vertical a {
- color: var(--color--coolgray2) !important;
-}
-
-.wy-menu-vertical li.current a {
- border-right: none !important;
- color: var(--color--coolgray4) !important;
-}
-
-.wy-menu-vertical li.current {
- background-color: var(--color--warmgraylight) !important;
-}
-
-.wy-menu-vertical li.toctree-l2.current>a {
- background-color: var(--color--coolgray1) !important;
-}
-
-.wy-menu-vertical a:hover, .wy-menu-vertical li.current a:hover, .wy-menu-vertical li.toctree-l2.current>a:hover {
- color: var(--color--warmgraylight) !important;
- background-color: var(--color--coolgray3) !important;
-}
-
-.wy-alert-title, .rst-content .admonition-title {
- background-color: var(--color--mediumblue) !important;
-}
-
-.wy-alert, .rst-content .note, .rst-content .attention, .rst-content .caution, .rst-content .danger, .rst-content .error, .rst-content .hint, .rst-content .important, .rst-content .tip, .rst-content .warning, .rst-content .seealso, .rst-content .admonition-todo, .rst-content .admonition {
- background-color: var(--color--warmgraylight) !important;
-}
-
-.rst-content dl:not(.docutils) dt {
- border-color: var(--color--mediumblue) !important;
- background-color: var(--color--warmgraylight) !important;
-}
-
-/* .rst-content pre.literal-block, .rst-content div[class^='highlight'] {
- background-color: var(--color--warmgraylight) !important;
-} */
-
-.wy-table-odd td, .wy-table-striped tr:nth-child(2n-1) td, .rst-content table.docutils:not(.field-list) tr:nth-child(2n-1) td {
- background-color: var(--color--warmgraylight) !important;
-}
-
-@media screen and (min-width: 1100px) {
- .wy-nav-content-wrap {
- background-color: var(--color--warmgraylight) !important;
- }
-}
-
-.wy-side-nav-search img {
- height: auto !important;
- width: 100% !important;
- padding: 0 !important;
- background-color: inherit !important;
- border-radius: 0 !important;
- margin: 0 !important
-}
-
-.wy-side-nav-search>a, .wy-side-nav-search .wy-dropdown>a {
- margin-bottom: 0 !important;
-}
-
-.wy-menu-vertical li.toctree-l1.current>a {
- border: none !important;
-}
-
-.wy-side-nav-search>div.version {
- color: var(--color--coolgray2) !important;
-}
\ No newline at end of file
diff --git a/docs/_build/html/_static/css/theme.css b/docs/_build/html/_static/css/theme.css
deleted file mode 100644
index 03a13df62..000000000
--- a/docs/_build/html/_static/css/theme.css
+++ /dev/null
@@ -1,6 +0,0 @@
-/* sphinx_rtd_theme version 0.4.1 | MIT license */
-/* Built 20180727 10:07 */
-*{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}article,aside,details,figcaption,figure,footer,header,hgroup,nav,section{display:block}audio,canvas,video{display:inline-block;*display:inline;*zoom:1}audio:not([controls]){display:none}[hidden]{display:none}*{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}html{font-size:100%;-webkit-text-size-adjust:100%;-ms-text-size-adjust:100%}body{margin:0}a:hover,a:active{outline:0}abbr[title]{border-bottom:1px dotted}b,strong{font-weight:bold}blockquote{margin:0}dfn{font-style:italic}ins{background:#ff9;color:#000;text-decoration:none}mark{background:#ff0;color:#000;font-style:italic;font-weight:bold}pre,code,.rst-content tt,.rst-content code,kbd,samp{font-family:monospace,serif;_font-family:"courier new",monospace;font-size:1em}pre{white-space:pre}q{quotes:none}q:before,q:after{content:"";content:none}small{font-size:85%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sup{top:-0.5em}sub{bottom:-0.25em}ul,ol,dl{margin:0;padding:0;list-style:none;list-style-image:none}li{list-style:none}dd{margin:0}img{border:0;-ms-interpolation-mode:bicubic;vertical-align:middle;max-width:100%}svg:not(:root){overflow:hidden}figure{margin:0}form{margin:0}fieldset{border:0;margin:0;padding:0}label{cursor:pointer}legend{border:0;*margin-left:-7px;padding:0;white-space:normal}button,input,select,textarea{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle}button,input{line-height:normal}button,input[type="button"],input[type="reset"],input[type="submit"]{cursor:pointer;-webkit-appearance:button;*overflow:visible}button[disabled],input[disabled]{cursor:default}input[type="checkbox"],input[type="radio"]{box-sizing:border-box;padding:0;*width:13px;*height:13px}input[type="search"]{-webkit-appearance:textfield;-moz-box-sizing:content-box;-webkit-box-sizing:content-box;box-sizing:content-box}input[type="search"]::-webkit-search-decoration,in
put[type="search"]::-webkit-search-cancel-button{-webkit-appearance:none}button::-moz-focus-inner,input::-moz-focus-inner{border:0;padding:0}textarea{overflow:auto;vertical-align:top;resize:vertical}table{border-collapse:collapse;border-spacing:0}td{vertical-align:top}.chromeframe{margin:.2em 0;background:#ccc;color:#000;padding:.2em 0}.ir{display:block;border:0;text-indent:-999em;overflow:hidden;background-color:transparent;background-repeat:no-repeat;text-align:left;direction:ltr;*line-height:0}.ir br{display:none}.hidden{display:none !important;visibility:hidden}.visuallyhidden{border:0;clip:rect(0 0 0 0);height:1px;margin:-1px;overflow:hidden;padding:0;position:absolute;width:1px}.visuallyhidden.focusable:active,.visuallyhidden.focusable:focus{clip:auto;height:auto;margin:0;overflow:visible;position:static;width:auto}.invisible{visibility:hidden}.relative{position:relative}big,small{font-size:100%}@media print{html,body,section{background:none !important}*{box-shadow:none !important;text-shadow:none !important;filter:none !important;-ms-filter:none !important}a,a:visited{text-decoration:underline}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:""}pre,blockquote{page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}img{max-width:100% !important}@page{margin:.5cm}p,h2,.rst-content .toctree-wrapper p.caption,h3{orphans:3;widows:3}h2,.rst-content .toctree-wrapper p.caption,h3{page-break-after:avoid}}.fa:before,.wy-menu-vertical li span.toctree-expand:before,.wy-menu-vertical li.on a span.toctree-expand:before,.wy-menu-vertical li.current>a span.toctree-expand:before,.rst-content .admonition-title:before,.rst-content h1 .headerlink:before,.rst-content h2 .headerlink:before,.rst-content h3 .headerlink:before,.rst-content h4 .headerlink:before,.rst-content h5 .headerlink:before,.rst-content h6 .headerlink:before,.rst-content dl dt .headerlink:before,.rst-content p.caption .headerlink:before,.rst-content 
table>caption .headerlink:before,.rst-content tt.download span:first-child:before,.rst-content code.download span:first-child:before,.icon:before,.wy-dropdown .caret:before,.wy-inline-validate.wy-inline-validate-success .wy-input-context:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before,.wy-alert,.rst-content .note,.rst-content .attention,.rst-content .caution,.rst-content .danger,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .tip,.rst-content .warning,.rst-content .seealso,.rst-content .admonition-todo,.rst-content .admonition,.btn,input[type="text"],input[type="password"],input[type="email"],input[type="url"],input[type="date"],input[type="month"],input[type="time"],input[type="datetime"],input[type="datetime-local"],input[type="week"],input[type="number"],input[type="search"],input[type="tel"],input[type="color"],select,textarea,.wy-menu-vertical li.on a,.wy-menu-vertical li.current>a,.wy-side-nav-search>a,.wy-side-nav-search .wy-dropdown>a,.wy-nav-top a{-webkit-font-smoothing:antialiased}.clearfix{*zoom:1}.clearfix:before,.clearfix:after{display:table;content:""}.clearfix:after{clear:both}/*!
- * Font Awesome 4.7.0 by @davegandy - http://fontawesome.io - @fontawesome
- * License - http://fontawesome.io/license (Font: SIL OFL 1.1, CSS: MIT License)
- */@font-face{font-family:'FontAwesome';src:url("../fonts/fontawesome-webfont.eot?v=4.7.0");src:url("../fonts/fontawesome-webfont.eot?#iefix&v=4.7.0") format("embedded-opentype"),url("../fonts/fontawesome-webfont.woff2?v=4.7.0") format("woff2"),url("../fonts/fontawesome-webfont.woff?v=4.7.0") format("woff"),url("../fonts/fontawesome-webfont.ttf?v=4.7.0") format("truetype"),url("../fonts/fontawesome-webfont.svg?v=4.7.0#fontawesomeregular") format("svg");font-weight:normal;font-style:normal}.fa,.wy-menu-vertical li span.toctree-expand,.wy-menu-vertical li.on a span.toctree-expand,.wy-menu-vertical li.current>a span.toctree-expand,.rst-content .admonition-title,.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 .headerlink,.rst-content dl dt .headerlink,.rst-content p.caption .headerlink,.rst-content table>caption .headerlink,.rst-content tt.download span:first-child,.rst-content code.download span:first-child,.icon{display:inline-block;font:normal normal normal 14px/1 FontAwesome;font-size:inherit;text-rendering:auto;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}.fa-lg{font-size:1.3333333333em;line-height:.75em;vertical-align:-15%}.fa-2x{font-size:2em}.fa-3x{font-size:3em}.fa-4x{font-size:4em}.fa-5x{font-size:5em}.fa-fw{width:1.2857142857em;text-align:center}.fa-ul{padding-left:0;margin-left:2.1428571429em;list-style-type:none}.fa-ul>li{position:relative}.fa-li{position:absolute;left:-2.1428571429em;width:2.1428571429em;top:.1428571429em;text-align:center}.fa-li.fa-lg{left:-1.8571428571em}.fa-border{padding:.2em .25em .15em;border:solid 0.08em #eee;border-radius:.1em}.fa-pull-left{float:left}.fa-pull-right{float:right}.fa.fa-pull-left,.wy-menu-vertical li span.fa-pull-left.toctree-expand,.wy-menu-vertical li.on a span.fa-pull-left.toctree-expand,.wy-menu-vertical li.current>a span.fa-pull-left.toctree-expand,.rst-content 
.fa-pull-left.admonition-title,.rst-content h1 .fa-pull-left.headerlink,.rst-content h2 .fa-pull-left.headerlink,.rst-content h3 .fa-pull-left.headerlink,.rst-content h4 .fa-pull-left.headerlink,.rst-content h5 .fa-pull-left.headerlink,.rst-content h6 .fa-pull-left.headerlink,.rst-content dl dt .fa-pull-left.headerlink,.rst-content p.caption .fa-pull-left.headerlink,.rst-content table>caption .fa-pull-left.headerlink,.rst-content tt.download span.fa-pull-left:first-child,.rst-content code.download span.fa-pull-left:first-child,.fa-pull-left.icon{margin-right:.3em}.fa.fa-pull-right,.wy-menu-vertical li span.fa-pull-right.toctree-expand,.wy-menu-vertical li.on a span.fa-pull-right.toctree-expand,.wy-menu-vertical li.current>a span.fa-pull-right.toctree-expand,.rst-content .fa-pull-right.admonition-title,.rst-content h1 .fa-pull-right.headerlink,.rst-content h2 .fa-pull-right.headerlink,.rst-content h3 .fa-pull-right.headerlink,.rst-content h4 .fa-pull-right.headerlink,.rst-content h5 .fa-pull-right.headerlink,.rst-content h6 .fa-pull-right.headerlink,.rst-content dl dt .fa-pull-right.headerlink,.rst-content p.caption .fa-pull-right.headerlink,.rst-content table>caption .fa-pull-right.headerlink,.rst-content tt.download span.fa-pull-right:first-child,.rst-content code.download span.fa-pull-right:first-child,.fa-pull-right.icon{margin-left:.3em}.pull-right{float:right}.pull-left{float:left}.fa.pull-left,.wy-menu-vertical li span.pull-left.toctree-expand,.wy-menu-vertical li.on a span.pull-left.toctree-expand,.wy-menu-vertical li.current>a span.pull-left.toctree-expand,.rst-content .pull-left.admonition-title,.rst-content h1 .pull-left.headerlink,.rst-content h2 .pull-left.headerlink,.rst-content h3 .pull-left.headerlink,.rst-content h4 .pull-left.headerlink,.rst-content h5 .pull-left.headerlink,.rst-content h6 .pull-left.headerlink,.rst-content dl dt .pull-left.headerlink,.rst-content p.caption .pull-left.headerlink,.rst-content table>caption 
.pull-left.headerlink,.rst-content tt.download span.pull-left:first-child,.rst-content code.download span.pull-left:first-child,.pull-left.icon{margin-right:.3em}.fa.pull-right,.wy-menu-vertical li span.pull-right.toctree-expand,.wy-menu-vertical li.on a span.pull-right.toctree-expand,.wy-menu-vertical li.current>a span.pull-right.toctree-expand,.rst-content .pull-right.admonition-title,.rst-content h1 .pull-right.headerlink,.rst-content h2 .pull-right.headerlink,.rst-content h3 .pull-right.headerlink,.rst-content h4 .pull-right.headerlink,.rst-content h5 .pull-right.headerlink,.rst-content h6 .pull-right.headerlink,.rst-content dl dt .pull-right.headerlink,.rst-content p.caption .pull-right.headerlink,.rst-content table>caption .pull-right.headerlink,.rst-content tt.download span.pull-right:first-child,.rst-content code.download span.pull-right:first-child,.pull-right.icon{margin-left:.3em}.fa-spin{-webkit-animation:fa-spin 2s infinite linear;animation:fa-spin 2s infinite linear}.fa-pulse{-webkit-animation:fa-spin 1s infinite steps(8);animation:fa-spin 1s infinite steps(8)}@-webkit-keyframes fa-spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}100%{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}@keyframes fa-spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}100%{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}.fa-rotate-90{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=1)";-webkit-transform:rotate(90deg);-ms-transform:rotate(90deg);transform:rotate(90deg)}.fa-rotate-180{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=2)";-webkit-transform:rotate(180deg);-ms-transform:rotate(180deg);transform:rotate(180deg)}.fa-rotate-270{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=3)";-webkit-transform:rotate(270deg);-ms-transform:rotate(270deg);transform:rotate(270deg)}.fa-flip-horizontal{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=0, 
mirror=1)";-webkit-transform:scale(-1, 1);-ms-transform:scale(-1, 1);transform:scale(-1, 1)}.fa-flip-vertical{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=2, mirror=1)";-webkit-transform:scale(1, -1);-ms-transform:scale(1, -1);transform:scale(1, -1)}:root .fa-rotate-90,:root .fa-rotate-180,:root .fa-rotate-270,:root .fa-flip-horizontal,:root .fa-flip-vertical{filter:none}.fa-stack{position:relative;display:inline-block;width:2em;height:2em;line-height:2em;vertical-align:middle}.fa-stack-1x,.fa-stack-2x{position:absolute;left:0;width:100%;text-align:center}.fa-stack-1x{line-height:inherit}.fa-stack-2x{font-size:2em}.fa-inverse{color:#fff}.fa-glass:before{content:""}.fa-music:before{content:""}.fa-search:before,.icon-search:before{content:""}.fa-envelope-o:before{content:""}.fa-heart:before{content:""}.fa-star:before{content:""}.fa-star-o:before{content:""}.fa-user:before{content:""}.fa-film:before{content:""}.fa-th-large:before{content:""}.fa-th:before{content:""}.fa-th-list:before{content:""}.fa-check:before{content:""}.fa-remove:before,.fa-close:before,.fa-times:before{content:""}.fa-search-plus:before{content:""}.fa-search-minus:before{content:""}.fa-power-off:before{content:""}.fa-signal:before{content:""}.fa-gear:before,.fa-cog:before{content:""}.fa-trash-o:before{content:""}.fa-home:before,.icon-home:before{content:""}.fa-file-o:before{content:""}.fa-clock-o:before{content:""}.fa-road:before{content:""}.fa-download:before,.rst-content tt.download span:first-child:before,.rst-content code.download 
span:first-child:before{content:""}.fa-arrow-circle-o-down:before{content:""}.fa-arrow-circle-o-up:before{content:""}.fa-inbox:before{content:""}.fa-play-circle-o:before{content:""}.fa-rotate-right:before,.fa-repeat:before{content:""}.fa-refresh:before{content:""}.fa-list-alt:before{content:""}.fa-lock:before{content:""}.fa-flag:before{content:""}.fa-headphones:before{content:""}.fa-volume-off:before{content:""}.fa-volume-down:before{content:""}.fa-volume-up:before{content:""}.fa-qrcode:before{content:""}.fa-barcode:before{content:""}.fa-tag:before{content:""}.fa-tags:before{content:""}.fa-book:before,.icon-book:before{content:""}.fa-bookmark:before{content:""}.fa-print:before{content:""}.fa-camera:before{content:""}.fa-font:before{content:""}.fa-bold:before{content:""}.fa-italic:before{content:""}.fa-text-height:before{content:""}.fa-text-width:before{content:""}.fa-align-left:before{content:""}.fa-align-center:before{content:""}.fa-align-right:before{content:""}.fa-align-justify:before{content:""}.fa-list:before{content:""}.fa-dedent:before,.fa-outdent:before{content:""}.fa-indent:before{content:""}.fa-video-camera:before{content:""}.fa-photo:before,.fa-image:before,.fa-picture-o:before{content:""}.fa-pencil:before{content:""}.fa-map-marker:before{content:""}.fa-adjust:before{content:""}.fa-tint:before{content:""}.fa-edit:before,.fa-pencil-square-o:before{content:""}.fa-share-square-o:before{content:""}.fa-check-square-o:before{content:""}.fa-arrows:before{content:""}.fa-step-backward:before{content:""}.fa-fast-backward:before{content:""}.fa-backward:before{content:""}.fa-play:before{content:""}.fa-pause:before{content:""}.fa-stop:before{content:""}.fa-forward:before{content:""}.fa-fast-forward:before{content:""}.fa-step-forward:before{content:""}.fa-eject:before{content:""}.fa-chevron-left:before{content:""}.fa-chevron-right:before{content:""}.fa-plus-circle:before{content:""}.fa-minus-circle:before{content:""}.fa-times-circle:before,.wy-inline-validate.wy-inline
-validate-danger .wy-input-context:before{content:""}.fa-check-circle:before,.wy-inline-validate.wy-inline-validate-success .wy-input-context:before{content:""}.fa-question-circle:before{content:""}.fa-info-circle:before{content:""}.fa-crosshairs:before{content:""}.fa-times-circle-o:before{content:""}.fa-check-circle-o:before{content:""}.fa-ban:before{content:""}.fa-arrow-left:before{content:""}.fa-arrow-right:before{content:""}.fa-arrow-up:before{content:""}.fa-arrow-down:before{content:""}.fa-mail-forward:before,.fa-share:before{content:""}.fa-expand:before{content:""}.fa-compress:before{content:""}.fa-plus:before{content:""}.fa-minus:before{content:""}.fa-asterisk:before{content:""}.fa-exclamation-circle:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before,.rst-content .admonition-title:before{content:""}.fa-gift:before{content:""}.fa-leaf:before{content:""}.fa-fire:before,.icon-fire:before{content:""}.fa-eye:before{content:""}.fa-eye-slash:before{content:""}.fa-warning:before,.fa-exclamation-triangle:before{content:""}.fa-plane:before{content:""}.fa-calendar:before{content:""}.fa-random:before{content:""}.fa-comment:before{content:""}.fa-magnet:before{content:""}.fa-chevron-up:before{content:""}.fa-chevron-down:before{content:""}.fa-retweet:before{content:""}.fa-shopping-cart:before{content:""}.fa-folder:before{content:""}.fa-folder-open:before{content:""}.fa-arrows-v:before{content:""}.fa-arrows-h:before{content:""}.fa-bar-chart-o:before,.fa-bar-chart:before{content:""}.fa-twitter-square:before{content:""}.fa-facebook-square:before{content:""}.fa-camera-retro:before{content:""}.fa-key:before{content:""}.fa-gears:before,.fa-cogs:before{content:""}.fa-comments:before{content:""}.fa-thumbs-o-up:before{content:""}.fa-thumbs-o-down:before{content:""}.fa-star-half:before{content:""}.fa-heart-o:before{content:""}.fa-sign-out:before{content:""}.fa-linkedin-square:before{conte
nt:""}.fa-thumb-tack:before{content:""}.fa-external-link:before{content:""}.fa-sign-in:before{content:""}.fa-trophy:before{content:""}.fa-github-square:before{content:""}.fa-upload:before{content:""}.fa-lemon-o:before{content:""}.fa-phone:before{content:""}.fa-square-o:before{content:""}.fa-bookmark-o:before{content:""}.fa-phone-square:before{content:""}.fa-twitter:before{content:""}.fa-facebook-f:before,.fa-facebook:before{content:""}.fa-github:before,.icon-github:before{content:""}.fa-unlock:before{content:""}.fa-credit-card:before{content:""}.fa-feed:before,.fa-rss:before{content:""}.fa-hdd-o:before{content:""}.fa-bullhorn:before{content:""}.fa-bell:before{content:""}.fa-certificate:before{content:""}.fa-hand-o-right:before{content:""}.fa-hand-o-left:before{content:""}.fa-hand-o-up:before{content:""}.fa-hand-o-down:before{content:""}.fa-arrow-circle-left:before,.icon-circle-arrow-left:before{content:""}.fa-arrow-circle-right:before,.icon-circle-arrow-right:before{content:""}.fa-arrow-circle-up:before{content:""}.fa-arrow-circle-down:before{content:""}.fa-globe:before{content:""}.fa-wrench:before{content:""}.fa-tasks:before{content:""}.fa-filter:before{content:""}.fa-briefcase:before{content:""}.fa-arrows-alt:before{content:""}.fa-group:before,.fa-users:before{content:""}.fa-chain:before,.fa-link:before,.icon-link:before{content:""}.fa-cloud:before{content:""}.fa-flask:before{content:""}.fa-cut:before,.fa-scissors:before{content:""}.fa-copy:before,.fa-files-o:before{content:""}.fa-paperclip:before{content:""}.fa-save:before,.fa-floppy-o:before{content:""}.fa-square:before{content:""}.fa-navicon:before,.fa-reorder:before,.fa-bars:before{content:""}.fa-list-ul:before{content:""}.fa-list-ol:before{content:""}.fa-strikethrough:before{content:""}.fa-underline:before{content:""}.fa-table:before{content:""}.fa-magic:before{content:""}.fa-truck:before{content:""}.fa-pinterest:before{content:""}.fa-pinterest-square:before{content:""}.fa-google-plus-square:before{content:""
}.fa-google-plus:before{content:""}.fa-money:before{content:""}.fa-caret-down:before,.wy-dropdown .caret:before,.icon-caret-down:before{content:""}.fa-caret-up:before{content:""}.fa-caret-left:before{content:""}.fa-caret-right:before{content:""}.fa-columns:before{content:""}.fa-unsorted:before,.fa-sort:before{content:""}.fa-sort-down:before,.fa-sort-desc:before{content:""}.fa-sort-up:before,.fa-sort-asc:before{content:""}.fa-envelope:before{content:""}.fa-linkedin:before{content:""}.fa-rotate-left:before,.fa-undo:before{content:""}.fa-legal:before,.fa-gavel:before{content:""}.fa-dashboard:before,.fa-tachometer:before{content:""}.fa-comment-o:before{content:""}.fa-comments-o:before{content:""}.fa-flash:before,.fa-bolt:before{content:""}.fa-sitemap:before{content:""}.fa-umbrella:before{content:""}.fa-paste:before,.fa-clipboard:before{content:""}.fa-lightbulb-o:before{content:""}.fa-exchange:before{content:""}.fa-cloud-download:before{content:""}.fa-cloud-upload:before{content:""}.fa-user-md:before{content:""}.fa-stethoscope:before{content:""}.fa-suitcase:before{content:""}.fa-bell-o:before{content:""}.fa-coffee:before{content:""}.fa-cutlery:before{content:""}.fa-file-text-o:before{content:""}.fa-building-o:before{content:""}.fa-hospital-o:before{content:""}.fa-ambulance:before{content:""}.fa-medkit:before{content:""}.fa-fighter-jet:before{content:""}.fa-beer:before{content:""}.fa-h-square:before{content:""}.fa-plus-square:before{content:""}.fa-angle-double-left:before{content:""}.fa-angle-double-right:before{content:""}.fa-angle-double-up:before{content:""}.fa-angle-double-down:before{content:""}.fa-angle-left:before{content:""}.fa-angle-right:before{content:""}.fa-angle-up:before{content:""}.fa-angle-down:before{content:""}.fa-desktop:before{content:""}.fa-laptop:before{content:""}.fa-tablet:before{content:""}.fa-mobile-phone:before,.fa-mobile:before{content:""}.fa-circle-o:before{content:""}.fa-quote-left:before{content:""}.fa-quote-right:before{content:""}.fa-spinn
er:before{content:""}.fa-circle:before{content:""}.fa-mail-reply:before,.fa-reply:before{content:""}.fa-github-alt:before{content:""}.fa-folder-o:before{content:""}.fa-folder-open-o:before{content:""}.fa-smile-o:before{content:""}.fa-frown-o:before{content:""}.fa-meh-o:before{content:""}.fa-gamepad:before{content:""}.fa-keyboard-o:before{content:""}.fa-flag-o:before{content:""}.fa-flag-checkered:before{content:""}.fa-terminal:before{content:""}.fa-code:before{content:""}.fa-mail-reply-all:before,.fa-reply-all:before{content:""}.fa-star-half-empty:before,.fa-star-half-full:before,.fa-star-half-o:before{content:""}.fa-location-arrow:before{content:""}.fa-crop:before{content:""}.fa-code-fork:before{content:""}.fa-unlink:before,.fa-chain-broken:before{content:""}.fa-question:before{content:""}.fa-info:before{content:""}.fa-exclamation:before{content:""}.fa-superscript:before{content:""}.fa-subscript:before{content:""}.fa-eraser:before{content:""}.fa-puzzle-piece:before{content:""}.fa-microphone:before{content:""}.fa-microphone-slash:before{content:""}.fa-shield:before{content:""}.fa-calendar-o:before{content:""}.fa-fire-extinguisher:before{content:""}.fa-rocket:before{content:""}.fa-maxcdn:before{content:""}.fa-chevron-circle-left:before{content:""}.fa-chevron-circle-right:before{content:""}.fa-chevron-circle-up:before{content:""}.fa-chevron-circle-down:before{content:""}.fa-html5:before{content:""}.fa-css3:before{content:""}.fa-anchor:before{content:""}.fa-unlock-alt:before{content:""}.fa-bullseye:before{content:""}.fa-ellipsis-h:before{content:""}.fa-ellipsis-v:before{content:""}.fa-rss-square:before{content:""}.fa-play-circle:before{content:""}.fa-ticket:before{content:""}.fa-minus-square:before{content:""}.fa-minus-square-o:before,.wy-menu-vertical li.on a span.toctree-expand:before,.wy-menu-vertical li.current>a 
span.toctree-expand:before{content:""}.fa-level-up:before{content:""}.fa-level-down:before{content:""}.fa-check-square:before{content:""}.fa-pencil-square:before{content:""}.fa-external-link-square:before{content:""}.fa-share-square:before{content:""}.fa-compass:before{content:""}.fa-toggle-down:before,.fa-caret-square-o-down:before{content:""}.fa-toggle-up:before,.fa-caret-square-o-up:before{content:""}.fa-toggle-right:before,.fa-caret-square-o-right:before{content:""}.fa-euro:before,.fa-eur:before{content:""}.fa-gbp:before{content:""}.fa-dollar:before,.fa-usd:before{content:""}.fa-rupee:before,.fa-inr:before{content:""}.fa-cny:before,.fa-rmb:before,.fa-yen:before,.fa-jpy:before{content:""}.fa-ruble:before,.fa-rouble:before,.fa-rub:before{content:""}.fa-won:before,.fa-krw:before{content:""}.fa-bitcoin:before,.fa-btc:before{content:""}.fa-file:before{content:""}.fa-file-text:before{content:""}.fa-sort-alpha-asc:before{content:""}.fa-sort-alpha-desc:before{content:""}.fa-sort-amount-asc:before{content:""}.fa-sort-amount-desc:before{content:""}.fa-sort-numeric-asc:before{content:""}.fa-sort-numeric-desc:before{content:""}.fa-thumbs-up:before{content:""}.fa-thumbs-down:before{content:""}.fa-youtube-square:before{content:""}.fa-youtube:before{content:""}.fa-xing:before{content:""}.fa-xing-square:before{content:""}.fa-youtube-play:before{content:""}.fa-dropbox:before{content:""}.fa-stack-overflow:before{content:""}.fa-instagram:before{content:""}.fa-flickr:before{content:""}.fa-adn:before{content:""}.fa-bitbucket:before,.icon-bitbucket:before{content:""}.fa-bitbucket-square:before{content:""}.fa-tumblr:before{content:""}.fa-tumblr-square:before{content:""}.fa-long-arrow-down:before{content:""}.fa-long-arrow-up:before{content:""}.fa-long-arrow-left:before{content:""}.fa-long-arrow-right:before{content:""}.fa-apple:before{content:""}.fa-windows:before{content:""}.fa-android:before{content:""}.fa-linux:before{content:""}.fa-dribbble:before{content:""}.fa-skype:before{conten
t:""}.fa-foursquare:before{content:""}.fa-trello:before{content:""}.fa-female:before{content:""}.fa-male:before{content:""}.fa-gittip:before,.fa-gratipay:before{content:""}.fa-sun-o:before{content:""}.fa-moon-o:before{content:""}.fa-archive:before{content:""}.fa-bug:before{content:""}.fa-vk:before{content:""}.fa-weibo:before{content:""}.fa-renren:before{content:""}.fa-pagelines:before{content:""}.fa-stack-exchange:before{content:""}.fa-arrow-circle-o-right:before{content:""}.fa-arrow-circle-o-left:before{content:""}.fa-toggle-left:before,.fa-caret-square-o-left:before{content:""}.fa-dot-circle-o:before{content:""}.fa-wheelchair:before{content:""}.fa-vimeo-square:before{content:""}.fa-turkish-lira:before,.fa-try:before{content:""}.fa-plus-square-o:before,.wy-menu-vertical li span.toctree-expand:before{content:""}.fa-space-shuttle:before{content:""}.fa-slack:before{content:""}.fa-envelope-square:before{content:""}.fa-wordpress:before{content:""}.fa-openid:before{content:""}.fa-institution:before,.fa-bank:before,.fa-university:before{content:""}.fa-mortar-board:before,.fa-graduation-cap:before{content:""}.fa-yahoo:before{content:""}.fa-google:before{content:""}.fa-reddit:before{content:""}.fa-reddit-square:before{content:""}.fa-stumbleupon-circle:before{content:""}.fa-stumbleupon:before{content:""}.fa-delicious:before{content:""}.fa-digg:before{content:""}.fa-pied-piper-pp:before{content:""}.fa-pied-piper-alt:before{content:""}.fa-drupal:before{content:""}.fa-joomla:before{content:""}.fa-language:before{content:""}.fa-fax:before{content:""}.fa-building:before{content:""}.fa-child:before{content:""}.fa-paw:before{content:""}.fa-spoon:before{content:""}.fa-cube:before{content:""}.fa-cubes:before{content:""}.fa-behance:before{content:""}.fa-behance-square:before{content:""}.fa-steam:before{content:""}.fa-steam-square:before{content:""}.fa-recycle:before{content:""}.fa-automobile:before,.fa-car:before{content:""}.fa-cab:before,.fa-taxi:before{content:""}.fa-tree:before{con
tent:""}.fa-spotify:before{content:""}.fa-deviantart:before{content:""}.fa-soundcloud:before{content:""}.fa-database:before{content:""}.fa-file-pdf-o:before{content:""}.fa-file-word-o:before{content:""}.fa-file-excel-o:before{content:""}.fa-file-powerpoint-o:before{content:""}.fa-file-photo-o:before,.fa-file-picture-o:before,.fa-file-image-o:before{content:""}.fa-file-zip-o:before,.fa-file-archive-o:before{content:""}.fa-file-sound-o:before,.fa-file-audio-o:before{content:""}.fa-file-movie-o:before,.fa-file-video-o:before{content:""}.fa-file-code-o:before{content:""}.fa-vine:before{content:""}.fa-codepen:before{content:""}.fa-jsfiddle:before{content:""}.fa-life-bouy:before,.fa-life-buoy:before,.fa-life-saver:before,.fa-support:before,.fa-life-ring:before{content:""}.fa-circle-o-notch:before{content:""}.fa-ra:before,.fa-resistance:before,.fa-rebel:before{content:""}.fa-ge:before,.fa-empire:before{content:""}.fa-git-square:before{content:""}.fa-git:before{content:""}.fa-y-combinator-square:before,.fa-yc-square:before,.fa-hacker-news:before{content:""}.fa-tencent-weibo:before{content:""}.fa-qq:before{content:""}.fa-wechat:before,.fa-weixin:before{content:""}.fa-send:before,.fa-paper-plane:before{content:""}.fa-send-o:before,.fa-paper-plane-o:before{content:""}.fa-history:before{content:""}.fa-circle-thin:before{content:""}.fa-header:before{content:""}.fa-paragraph:before{content:""}.fa-sliders:before{content:""}.fa-share-alt:before{content:""}.fa-share-alt-square:before{content:""}.fa-bomb:before{content:""}.fa-soccer-ball-o:before,.fa-futbol-o:before{content:""}.fa-tty:before{content:""}.fa-binoculars:before{content:""}.fa-plug:before{content:""}.fa-slideshare:before{content:""}.fa-twitch:before{content:""}.fa-yelp:before{content:""}.fa-newspaper-o:before{content:""}.fa-wifi:before{content:""}.fa-calculator:before{content:""}.fa-paypal:before{content:""}.fa-google-wallet:before{content:""}.fa-cc-visa:before{content:""}.fa-cc-mastercard:before{content:""}.fa-cc-discove
r:before{content:""}.fa-cc-amex:before{content:""}.fa-cc-paypal:before{content:""}.fa-cc-stripe:before{content:""}.fa-bell-slash:before{content:""}.fa-bell-slash-o:before{content:""}.fa-trash:before{content:""}.fa-copyright:before{content:""}.fa-at:before{content:""}.fa-eyedropper:before{content:""}.fa-paint-brush:before{content:""}.fa-birthday-cake:before{content:""}.fa-area-chart:before{content:""}.fa-pie-chart:before{content:""}.fa-line-chart:before{content:""}.fa-lastfm:before{content:""}.fa-lastfm-square:before{content:""}.fa-toggle-off:before{content:""}.fa-toggle-on:before{content:""}.fa-bicycle:before{content:""}.fa-bus:before{content:""}.fa-ioxhost:before{content:""}.fa-angellist:before{content:""}.fa-cc:before{content:""}.fa-shekel:before,.fa-sheqel:before,.fa-ils:before{content:""}.fa-meanpath:before{content:""}.fa-buysellads:before{content:""}.fa-connectdevelop:before{content:""}.fa-dashcube:before{content:""}.fa-forumbee:before{content:""}.fa-leanpub:before{content:""}.fa-sellsy:before{content:""}.fa-shirtsinbulk:before{content:""}.fa-simplybuilt:before{content:""}.fa-skyatlas:before{content:""}.fa-cart-plus:before{content:""}.fa-cart-arrow-down:before{content:""}.fa-diamond:before{content:""}.fa-ship:before{content:""}.fa-user-secret:before{content:""}.fa-motorcycle:before{content:""}.fa-street-view:before{content:""}.fa-heartbeat:before{content:""}.fa-venus:before{content:""}.fa-mars:before{content:""}.fa-mercury:before{content:""}.fa-intersex:before,.fa-transgender:before{content:""}.fa-transgender-alt:before{content:""}.fa-venus-double:before{content:""}.fa-mars-double:before{content:""}.fa-venus-mars:before{content:""}.fa-mars-stroke:before{content:""}.fa-mars-stroke-v:before{content:""}.fa-mars-stroke-h:before{content:""}.fa-neuter:before{content:""}.fa-genderless:before{content:""}.fa-facebook-official:before{content:""}.fa-pinterest-p:before{content:""}.fa-whatsapp:before{content:""}.fa-server:before{content:""}.fa-user-plus:before{content:""}.f
a-user-times:before{content:""}.fa-hotel:before,.fa-bed:before{content:""}.fa-viacoin:before{content:""}.fa-train:before{content:""}.fa-subway:before{content:""}.fa-medium:before{content:""}.fa-yc:before,.fa-y-combinator:before{content:""}.fa-optin-monster:before{content:""}.fa-opencart:before{content:""}.fa-expeditedssl:before{content:""}.fa-battery-4:before,.fa-battery:before,.fa-battery-full:before{content:""}.fa-battery-3:before,.fa-battery-three-quarters:before{content:""}.fa-battery-2:before,.fa-battery-half:before{content:""}.fa-battery-1:before,.fa-battery-quarter:before{content:""}.fa-battery-0:before,.fa-battery-empty:before{content:""}.fa-mouse-pointer:before{content:""}.fa-i-cursor:before{content:""}.fa-object-group:before{content:""}.fa-object-ungroup:before{content:""}.fa-sticky-note:before{content:""}.fa-sticky-note-o:before{content:""}.fa-cc-jcb:before{content:""}.fa-cc-diners-club:before{content:""}.fa-clone:before{content:""}.fa-balance-scale:before{content:""}.fa-hourglass-o:before{content:""}.fa-hourglass-1:before,.fa-hourglass-start:before{content:""}.fa-hourglass-2:before,.fa-hourglass-half:before{content:""}.fa-hourglass-3:before,.fa-hourglass-end:before{content:""}.fa-hourglass:before{content:""}.fa-hand-grab-o:before,.fa-hand-rock-o:before{content:""}.fa-hand-stop-o:before,.fa-hand-paper-o:before{content:""}.fa-hand-scissors-o:before{content:""}.fa-hand-lizard-o:before{content:""}.fa-hand-spock-o:before{content:""}.fa-hand-pointer-o:before{content:""}.fa-hand-peace-o:before{content:""}.fa-trademark:before{content:""}.fa-registered:before{content:""}.fa-creative-commons:before{content:""}.fa-gg:before{content:""}.fa-gg-circle:before{content:""}.fa-tripadvisor:before{content:""}.fa-odnoklassniki:before{content:""}.fa-odnoklassniki-square:before{content:""}.fa-get-pocket:before{content:""}.fa-wikipedia-w:before{content:""}.fa-safari:before{content:""}.fa-chrome:before{content:""}.fa-firefox:before{content:""}.fa-opera:before{content:""}.fa-inte
rnet-explorer:before{content:""}.fa-tv:before,.fa-television:before{content:""}.fa-contao:before{content:""}.fa-500px:before{content:""}.fa-amazon:before{content:""}.fa-calendar-plus-o:before{content:""}.fa-calendar-minus-o:before{content:""}.fa-calendar-times-o:before{content:""}.fa-calendar-check-o:before{content:""}.fa-industry:before{content:""}.fa-map-pin:before{content:""}.fa-map-signs:before{content:""}.fa-map-o:before{content:""}.fa-map:before{content:""}.fa-commenting:before{content:""}.fa-commenting-o:before{content:""}.fa-houzz:before{content:""}.fa-vimeo:before{content:""}.fa-black-tie:before{content:""}.fa-fonticons:before{content:""}.fa-reddit-alien:before{content:""}.fa-edge:before{content:""}.fa-credit-card-alt:before{content:""}.fa-codiepie:before{content:""}.fa-modx:before{content:""}.fa-fort-awesome:before{content:""}.fa-usb:before{content:""}.fa-product-hunt:before{content:""}.fa-mixcloud:before{content:""}.fa-scribd:before{content:""}.fa-pause-circle:before{content:""}.fa-pause-circle-o:before{content:""}.fa-stop-circle:before{content:""}.fa-stop-circle-o:before{content:""}.fa-shopping-bag:before{content:""}.fa-shopping-basket:before{content:""}.fa-hashtag:before{content:""}.fa-bluetooth:before{content:""}.fa-bluetooth-b:before{content:""}.fa-percent:before{content:""}.fa-gitlab:before,.icon-gitlab:before{content:""}.fa-wpbeginner:before{content:""}.fa-wpforms:before{content:""}.fa-envira:before{content:""}.fa-universal-access:before{content:""}.fa-wheelchair-alt:before{content:""}.fa-question-circle-o:before{content:""}.fa-blind:before{content:""}.fa-audio-description:before{content:""}.fa-volume-control-phone:before{content:""}.fa-braille:before{content:""}.fa-assistive-listening-systems:before{content:""}.fa-asl-interpreting:before,.fa-american-sign-language-interpreting:before{content:""}.fa-deafness:before,.fa-hard-of-hearing:before,.fa-deaf:before{content:""}.fa-glide:before{content:""}.fa-glide-g:before{content:""}.fa-signing:before,.fa-s
ign-language:before{content:""}.fa-low-vision:before{content:""}.fa-viadeo:before{content:""}.fa-viadeo-square:before{content:""}.fa-snapchat:before{content:""}.fa-snapchat-ghost:before{content:""}.fa-snapchat-square:before{content:""}.fa-pied-piper:before{content:""}.fa-first-order:before{content:""}.fa-yoast:before{content:""}.fa-themeisle:before{content:""}.fa-google-plus-circle:before,.fa-google-plus-official:before{content:""}.fa-fa:before,.fa-font-awesome:before{content:""}.fa-handshake-o:before{content:""}.fa-envelope-open:before{content:""}.fa-envelope-open-o:before{content:""}.fa-linode:before{content:""}.fa-address-book:before{content:""}.fa-address-book-o:before{content:""}.fa-vcard:before,.fa-address-card:before{content:""}.fa-vcard-o:before,.fa-address-card-o:before{content:""}.fa-user-circle:before{content:""}.fa-user-circle-o:before{content:""}.fa-user-o:before{content:""}.fa-id-badge:before{content:""}.fa-drivers-license:before,.fa-id-card:before{content:""}.fa-drivers-license-o:before,.fa-id-card-o:before{content:""}.fa-quora:before{content:""}.fa-free-code-camp:before{content:""}.fa-telegram:before{content:""}.fa-thermometer-4:before,.fa-thermometer:before,.fa-thermometer-full:before{content:""}.fa-thermometer-3:before,.fa-thermometer-three-quarters:before{content:""}.fa-thermometer-2:before,.fa-thermometer-half:before{content:""}.fa-thermometer-1:before,.fa-thermometer-quarter:before{content:""}.fa-thermometer-0:before,.fa-thermometer-empty:before{content:""}.fa-shower:before{content:""}.fa-bathtub:before,.fa-s15:before,.fa-bath:before{content:""}.fa-podcast:before{content:""}.fa-window-maximize:before{content:""}.fa-window-minimize:before{content:""}.fa-window-restore:before{content:""}.fa-times-rectangle:before,.fa-window-close:before{content:""}.fa-times-rectangle-o:before,.fa-window-close-o:before{content:""}.fa-bandcamp:before{content:""}.fa-grav:before{content:""}.fa-etsy:before{content:""}.fa-imdb:before{content:""}.fa-ravelry:before{conten
t:""}.fa-eercast:before{content:""}.fa-microchip:before{content:""}.fa-snowflake-o:before{content:""}.fa-superpowers:before{content:""}.fa-wpexplorer:before{content:""}.fa-meetup:before{content:""}.sr-only{position:absolute;width:1px;height:1px;padding:0;margin:-1px;overflow:hidden;clip:rect(0, 0, 0, 0);border:0}.sr-only-focusable:active,.sr-only-focusable:focus{position:static;width:auto;height:auto;margin:0;overflow:visible;clip:auto}.fa,.wy-menu-vertical li span.toctree-expand,.wy-menu-vertical li.on a span.toctree-expand,.wy-menu-vertical li.current>a span.toctree-expand,.rst-content .admonition-title,.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 .headerlink,.rst-content dl dt .headerlink,.rst-content p.caption .headerlink,.rst-content table>caption .headerlink,.rst-content tt.download span:first-child,.rst-content code.download span:first-child,.icon,.wy-dropdown .caret,.wy-inline-validate.wy-inline-validate-success .wy-input-context,.wy-inline-validate.wy-inline-validate-danger .wy-input-context,.wy-inline-validate.wy-inline-validate-warning .wy-input-context,.wy-inline-validate.wy-inline-validate-info .wy-input-context{font-family:inherit}.fa:before,.wy-menu-vertical li span.toctree-expand:before,.wy-menu-vertical li.on a span.toctree-expand:before,.wy-menu-vertical li.current>a span.toctree-expand:before,.rst-content .admonition-title:before,.rst-content h1 .headerlink:before,.rst-content h2 .headerlink:before,.rst-content h3 .headerlink:before,.rst-content h4 .headerlink:before,.rst-content h5 .headerlink:before,.rst-content h6 .headerlink:before,.rst-content dl dt .headerlink:before,.rst-content p.caption .headerlink:before,.rst-content table>caption .headerlink:before,.rst-content tt.download span:first-child:before,.rst-content code.download span:first-child:before,.icon:before,.wy-dropdown 
.caret:before,.wy-inline-validate.wy-inline-validate-success .wy-input-context:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before{font-family:"FontAwesome";display:inline-block;font-style:normal;font-weight:normal;line-height:1;text-decoration:inherit}a .fa,a .wy-menu-vertical li span.toctree-expand,.wy-menu-vertical li a span.toctree-expand,.wy-menu-vertical li.on a span.toctree-expand,.wy-menu-vertical li.current>a span.toctree-expand,a .rst-content .admonition-title,.rst-content a .admonition-title,a .rst-content h1 .headerlink,.rst-content h1 a .headerlink,a .rst-content h2 .headerlink,.rst-content h2 a .headerlink,a .rst-content h3 .headerlink,.rst-content h3 a .headerlink,a .rst-content h4 .headerlink,.rst-content h4 a .headerlink,a .rst-content h5 .headerlink,.rst-content h5 a .headerlink,a .rst-content h6 .headerlink,.rst-content h6 a .headerlink,a .rst-content dl dt .headerlink,.rst-content dl dt a .headerlink,a .rst-content p.caption .headerlink,.rst-content p.caption a .headerlink,a .rst-content table>caption .headerlink,.rst-content table>caption a .headerlink,a .rst-content tt.download span:first-child,.rst-content tt.download a span:first-child,a .rst-content code.download span:first-child,.rst-content code.download a span:first-child,a .icon{display:inline-block;text-decoration:inherit}.btn .fa,.btn .wy-menu-vertical li span.toctree-expand,.wy-menu-vertical li .btn span.toctree-expand,.btn .wy-menu-vertical li.on a span.toctree-expand,.wy-menu-vertical li.on a .btn span.toctree-expand,.btn .wy-menu-vertical li.current>a span.toctree-expand,.wy-menu-vertical li.current>a .btn span.toctree-expand,.btn .rst-content .admonition-title,.rst-content .btn .admonition-title,.btn .rst-content h1 .headerlink,.rst-content h1 .btn .headerlink,.btn .rst-content h2 .headerlink,.rst-content h2 .btn 
.headerlink,.btn .rst-content h3 .headerlink,.rst-content h3 .btn .headerlink,.btn .rst-content h4 .headerlink,.rst-content h4 .btn .headerlink,.btn .rst-content h5 .headerlink,.rst-content h5 .btn .headerlink,.btn .rst-content h6 .headerlink,.rst-content h6 .btn .headerlink,.btn .rst-content dl dt .headerlink,.rst-content dl dt .btn .headerlink,.btn .rst-content p.caption .headerlink,.rst-content p.caption .btn .headerlink,.btn .rst-content table>caption .headerlink,.rst-content table>caption .btn .headerlink,.btn .rst-content tt.download span:first-child,.rst-content tt.download .btn span:first-child,.btn .rst-content code.download span:first-child,.rst-content code.download .btn span:first-child,.btn .icon,.nav .fa,.nav .wy-menu-vertical li span.toctree-expand,.wy-menu-vertical li .nav span.toctree-expand,.nav .wy-menu-vertical li.on a span.toctree-expand,.wy-menu-vertical li.on a .nav span.toctree-expand,.nav .wy-menu-vertical li.current>a span.toctree-expand,.wy-menu-vertical li.current>a .nav span.toctree-expand,.nav .rst-content .admonition-title,.rst-content .nav .admonition-title,.nav .rst-content h1 .headerlink,.rst-content h1 .nav .headerlink,.nav .rst-content h2 .headerlink,.rst-content h2 .nav .headerlink,.nav .rst-content h3 .headerlink,.rst-content h3 .nav .headerlink,.nav .rst-content h4 .headerlink,.rst-content h4 .nav .headerlink,.nav .rst-content h5 .headerlink,.rst-content h5 .nav .headerlink,.nav .rst-content h6 .headerlink,.rst-content h6 .nav .headerlink,.nav .rst-content dl dt .headerlink,.rst-content dl dt .nav .headerlink,.nav .rst-content p.caption .headerlink,.rst-content p.caption .nav .headerlink,.nav .rst-content table>caption .headerlink,.rst-content table>caption .nav .headerlink,.nav .rst-content tt.download span:first-child,.rst-content tt.download .nav span:first-child,.nav .rst-content code.download span:first-child,.rst-content code.download .nav span:first-child,.nav .icon{display:inline}.btn .fa.fa-large,.btn 
.wy-menu-vertical li span.fa-large.toctree-expand,.wy-menu-vertical li .btn span.fa-large.toctree-expand,.btn .rst-content .fa-large.admonition-title,.rst-content .btn .fa-large.admonition-title,.btn .rst-content h1 .fa-large.headerlink,.rst-content h1 .btn .fa-large.headerlink,.btn .rst-content h2 .fa-large.headerlink,.rst-content h2 .btn .fa-large.headerlink,.btn .rst-content h3 .fa-large.headerlink,.rst-content h3 .btn .fa-large.headerlink,.btn .rst-content h4 .fa-large.headerlink,.rst-content h4 .btn .fa-large.headerlink,.btn .rst-content h5 .fa-large.headerlink,.rst-content h5 .btn .fa-large.headerlink,.btn .rst-content h6 .fa-large.headerlink,.rst-content h6 .btn .fa-large.headerlink,.btn .rst-content dl dt .fa-large.headerlink,.rst-content dl dt .btn .fa-large.headerlink,.btn .rst-content p.caption .fa-large.headerlink,.rst-content p.caption .btn .fa-large.headerlink,.btn .rst-content table>caption .fa-large.headerlink,.rst-content table>caption .btn .fa-large.headerlink,.btn .rst-content tt.download span.fa-large:first-child,.rst-content tt.download .btn span.fa-large:first-child,.btn .rst-content code.download span.fa-large:first-child,.rst-content code.download .btn span.fa-large:first-child,.btn .fa-large.icon,.nav .fa.fa-large,.nav .wy-menu-vertical li span.fa-large.toctree-expand,.wy-menu-vertical li .nav span.fa-large.toctree-expand,.nav .rst-content .fa-large.admonition-title,.rst-content .nav .fa-large.admonition-title,.nav .rst-content h1 .fa-large.headerlink,.rst-content h1 .nav .fa-large.headerlink,.nav .rst-content h2 .fa-large.headerlink,.rst-content h2 .nav .fa-large.headerlink,.nav .rst-content h3 .fa-large.headerlink,.rst-content h3 .nav .fa-large.headerlink,.nav .rst-content h4 .fa-large.headerlink,.rst-content h4 .nav .fa-large.headerlink,.nav .rst-content h5 .fa-large.headerlink,.rst-content h5 .nav .fa-large.headerlink,.nav .rst-content h6 .fa-large.headerlink,.rst-content h6 .nav .fa-large.headerlink,.nav .rst-content dl dt 
.fa-large.headerlink,.rst-content dl dt .nav .fa-large.headerlink,.nav .rst-content p.caption .fa-large.headerlink,.rst-content p.caption .nav .fa-large.headerlink,.nav .rst-content table>caption .fa-large.headerlink,.rst-content table>caption .nav .fa-large.headerlink,.nav .rst-content tt.download span.fa-large:first-child,.rst-content tt.download .nav span.fa-large:first-child,.nav .rst-content code.download span.fa-large:first-child,.rst-content code.download .nav span.fa-large:first-child,.nav .fa-large.icon{line-height:.9em}.btn .fa.fa-spin,.btn .wy-menu-vertical li span.fa-spin.toctree-expand,.wy-menu-vertical li .btn span.fa-spin.toctree-expand,.btn .rst-content .fa-spin.admonition-title,.rst-content .btn .fa-spin.admonition-title,.btn .rst-content h1 .fa-spin.headerlink,.rst-content h1 .btn .fa-spin.headerlink,.btn .rst-content h2 .fa-spin.headerlink,.rst-content h2 .btn .fa-spin.headerlink,.btn .rst-content h3 .fa-spin.headerlink,.rst-content h3 .btn .fa-spin.headerlink,.btn .rst-content h4 .fa-spin.headerlink,.rst-content h4 .btn .fa-spin.headerlink,.btn .rst-content h5 .fa-spin.headerlink,.rst-content h5 .btn .fa-spin.headerlink,.btn .rst-content h6 .fa-spin.headerlink,.rst-content h6 .btn .fa-spin.headerlink,.btn .rst-content dl dt .fa-spin.headerlink,.rst-content dl dt .btn .fa-spin.headerlink,.btn .rst-content p.caption .fa-spin.headerlink,.rst-content p.caption .btn .fa-spin.headerlink,.btn .rst-content table>caption .fa-spin.headerlink,.rst-content table>caption .btn .fa-spin.headerlink,.btn .rst-content tt.download span.fa-spin:first-child,.rst-content tt.download .btn span.fa-spin:first-child,.btn .rst-content code.download span.fa-spin:first-child,.rst-content code.download .btn span.fa-spin:first-child,.btn .fa-spin.icon,.nav .fa.fa-spin,.nav .wy-menu-vertical li span.fa-spin.toctree-expand,.wy-menu-vertical li .nav span.fa-spin.toctree-expand,.nav .rst-content .fa-spin.admonition-title,.rst-content .nav .fa-spin.admonition-title,.nav 
.rst-content h1 .fa-spin.headerlink,.rst-content h1 .nav .fa-spin.headerlink,.nav .rst-content h2 .fa-spin.headerlink,.rst-content h2 .nav .fa-spin.headerlink,.nav .rst-content h3 .fa-spin.headerlink,.rst-content h3 .nav .fa-spin.headerlink,.nav .rst-content h4 .fa-spin.headerlink,.rst-content h4 .nav .fa-spin.headerlink,.nav .rst-content h5 .fa-spin.headerlink,.rst-content h5 .nav .fa-spin.headerlink,.nav .rst-content h6 .fa-spin.headerlink,.rst-content h6 .nav .fa-spin.headerlink,.nav .rst-content dl dt .fa-spin.headerlink,.rst-content dl dt .nav .fa-spin.headerlink,.nav .rst-content p.caption .fa-spin.headerlink,.rst-content p.caption .nav .fa-spin.headerlink,.nav .rst-content table>caption .fa-spin.headerlink,.rst-content table>caption .nav .fa-spin.headerlink,.nav .rst-content tt.download span.fa-spin:first-child,.rst-content tt.download .nav span.fa-spin:first-child,.nav .rst-content code.download span.fa-spin:first-child,.rst-content code.download .nav span.fa-spin:first-child,.nav .fa-spin.icon{display:inline-block}.btn.fa:before,.wy-menu-vertical li span.btn.toctree-expand:before,.rst-content .btn.admonition-title:before,.rst-content h1 .btn.headerlink:before,.rst-content h2 .btn.headerlink:before,.rst-content h3 .btn.headerlink:before,.rst-content h4 .btn.headerlink:before,.rst-content h5 .btn.headerlink:before,.rst-content h6 .btn.headerlink:before,.rst-content dl dt .btn.headerlink:before,.rst-content p.caption .btn.headerlink:before,.rst-content table>caption .btn.headerlink:before,.rst-content tt.download span.btn:first-child:before,.rst-content code.download span.btn:first-child:before,.btn.icon:before{opacity:.5;-webkit-transition:opacity .05s ease-in;-moz-transition:opacity .05s ease-in;transition:opacity .05s ease-in}.btn.fa:hover:before,.wy-menu-vertical li span.btn.toctree-expand:hover:before,.rst-content .btn.admonition-title:hover:before,.rst-content h1 .btn.headerlink:hover:before,.rst-content h2 .btn.headerlink:hover:before,.rst-content h3 
.btn.headerlink:hover:before,.rst-content h4 .btn.headerlink:hover:before,.rst-content h5 .btn.headerlink:hover:before,.rst-content h6 .btn.headerlink:hover:before,.rst-content dl dt .btn.headerlink:hover:before,.rst-content p.caption .btn.headerlink:hover:before,.rst-content table>caption .btn.headerlink:hover:before,.rst-content tt.download span.btn:first-child:hover:before,.rst-content code.download span.btn:first-child:hover:before,.btn.icon:hover:before{opacity:1}.btn-mini .fa:before,.btn-mini .wy-menu-vertical li span.toctree-expand:before,.wy-menu-vertical li .btn-mini span.toctree-expand:before,.btn-mini .rst-content .admonition-title:before,.rst-content .btn-mini .admonition-title:before,.btn-mini .rst-content h1 .headerlink:before,.rst-content h1 .btn-mini .headerlink:before,.btn-mini .rst-content h2 .headerlink:before,.rst-content h2 .btn-mini .headerlink:before,.btn-mini .rst-content h3 .headerlink:before,.rst-content h3 .btn-mini .headerlink:before,.btn-mini .rst-content h4 .headerlink:before,.rst-content h4 .btn-mini .headerlink:before,.btn-mini .rst-content h5 .headerlink:before,.rst-content h5 .btn-mini .headerlink:before,.btn-mini .rst-content h6 .headerlink:before,.rst-content h6 .btn-mini .headerlink:before,.btn-mini .rst-content dl dt .headerlink:before,.rst-content dl dt .btn-mini .headerlink:before,.btn-mini .rst-content p.caption .headerlink:before,.rst-content p.caption .btn-mini .headerlink:before,.btn-mini .rst-content table>caption .headerlink:before,.rst-content table>caption .btn-mini .headerlink:before,.btn-mini .rst-content tt.download span:first-child:before,.rst-content tt.download .btn-mini span:first-child:before,.btn-mini .rst-content code.download span:first-child:before,.rst-content code.download .btn-mini span:first-child:before,.btn-mini .icon:before{font-size:14px;vertical-align:-15%}.wy-alert,.rst-content .note,.rst-content .attention,.rst-content .caution,.rst-content .danger,.rst-content .error,.rst-content 
.hint,.rst-content .important,.rst-content .tip,.rst-content .warning,.rst-content .seealso,.rst-content .admonition-todo,.rst-content .admonition{padding:12px;line-height:24px;margin-bottom:24px;background:#e7f2fa}.wy-alert-title,.rst-content .admonition-title{color:#fff;font-weight:bold;display:block;color:#fff;background:#6ab0de;margin:-12px;padding:6px 12px;margin-bottom:12px}.wy-alert.wy-alert-danger,.rst-content .wy-alert-danger.note,.rst-content .wy-alert-danger.attention,.rst-content .wy-alert-danger.caution,.rst-content .danger,.rst-content .error,.rst-content .wy-alert-danger.hint,.rst-content .wy-alert-danger.important,.rst-content .wy-alert-danger.tip,.rst-content .wy-alert-danger.warning,.rst-content .wy-alert-danger.seealso,.rst-content .wy-alert-danger.admonition-todo,.rst-content .wy-alert-danger.admonition{background:#fdf3f2}.wy-alert.wy-alert-danger .wy-alert-title,.rst-content .wy-alert-danger.note .wy-alert-title,.rst-content .wy-alert-danger.attention .wy-alert-title,.rst-content .wy-alert-danger.caution .wy-alert-title,.rst-content .danger .wy-alert-title,.rst-content .error .wy-alert-title,.rst-content .wy-alert-danger.hint .wy-alert-title,.rst-content .wy-alert-danger.important .wy-alert-title,.rst-content .wy-alert-danger.tip .wy-alert-title,.rst-content .wy-alert-danger.warning .wy-alert-title,.rst-content .wy-alert-danger.seealso .wy-alert-title,.rst-content .wy-alert-danger.admonition-todo .wy-alert-title,.rst-content .wy-alert-danger.admonition .wy-alert-title,.wy-alert.wy-alert-danger .rst-content .admonition-title,.rst-content .wy-alert.wy-alert-danger .admonition-title,.rst-content .wy-alert-danger.note .admonition-title,.rst-content .wy-alert-danger.attention .admonition-title,.rst-content .wy-alert-danger.caution .admonition-title,.rst-content .danger .admonition-title,.rst-content .error .admonition-title,.rst-content .wy-alert-danger.hint .admonition-title,.rst-content .wy-alert-danger.important .admonition-title,.rst-content 
.wy-alert-danger.tip .admonition-title,.rst-content .wy-alert-danger.warning .admonition-title,.rst-content .wy-alert-danger.seealso .admonition-title,.rst-content .wy-alert-danger.admonition-todo .admonition-title,.rst-content .wy-alert-danger.admonition .admonition-title{background:#f29f97}.wy-alert.wy-alert-warning,.rst-content .wy-alert-warning.note,.rst-content .attention,.rst-content .caution,.rst-content .wy-alert-warning.danger,.rst-content .wy-alert-warning.error,.rst-content .wy-alert-warning.hint,.rst-content .wy-alert-warning.important,.rst-content .wy-alert-warning.tip,.rst-content .warning,.rst-content .wy-alert-warning.seealso,.rst-content .admonition-todo,.rst-content .wy-alert-warning.admonition{background:#ffedcc}.wy-alert.wy-alert-warning .wy-alert-title,.rst-content .wy-alert-warning.note .wy-alert-title,.rst-content .attention .wy-alert-title,.rst-content .caution .wy-alert-title,.rst-content .wy-alert-warning.danger .wy-alert-title,.rst-content .wy-alert-warning.error .wy-alert-title,.rst-content .wy-alert-warning.hint .wy-alert-title,.rst-content .wy-alert-warning.important .wy-alert-title,.rst-content .wy-alert-warning.tip .wy-alert-title,.rst-content .warning .wy-alert-title,.rst-content .wy-alert-warning.seealso .wy-alert-title,.rst-content .admonition-todo .wy-alert-title,.rst-content .wy-alert-warning.admonition .wy-alert-title,.wy-alert.wy-alert-warning .rst-content .admonition-title,.rst-content .wy-alert.wy-alert-warning .admonition-title,.rst-content .wy-alert-warning.note .admonition-title,.rst-content .attention .admonition-title,.rst-content .caution .admonition-title,.rst-content .wy-alert-warning.danger .admonition-title,.rst-content .wy-alert-warning.error .admonition-title,.rst-content .wy-alert-warning.hint .admonition-title,.rst-content .wy-alert-warning.important .admonition-title,.rst-content .wy-alert-warning.tip .admonition-title,.rst-content .warning .admonition-title,.rst-content .wy-alert-warning.seealso 
.admonition-title,.rst-content .admonition-todo .admonition-title,.rst-content .wy-alert-warning.admonition .admonition-title{background:#f0b37e}.wy-alert.wy-alert-info,.rst-content .note,.rst-content .wy-alert-info.attention,.rst-content .wy-alert-info.caution,.rst-content .wy-alert-info.danger,.rst-content .wy-alert-info.error,.rst-content .wy-alert-info.hint,.rst-content .wy-alert-info.important,.rst-content .wy-alert-info.tip,.rst-content .wy-alert-info.warning,.rst-content .seealso,.rst-content .wy-alert-info.admonition-todo,.rst-content .wy-alert-info.admonition{background:#e7f2fa}.wy-alert.wy-alert-info .wy-alert-title,.rst-content .note .wy-alert-title,.rst-content .wy-alert-info.attention .wy-alert-title,.rst-content .wy-alert-info.caution .wy-alert-title,.rst-content .wy-alert-info.danger .wy-alert-title,.rst-content .wy-alert-info.error .wy-alert-title,.rst-content .wy-alert-info.hint .wy-alert-title,.rst-content .wy-alert-info.important .wy-alert-title,.rst-content .wy-alert-info.tip .wy-alert-title,.rst-content .wy-alert-info.warning .wy-alert-title,.rst-content .seealso .wy-alert-title,.rst-content .wy-alert-info.admonition-todo .wy-alert-title,.rst-content .wy-alert-info.admonition .wy-alert-title,.wy-alert.wy-alert-info .rst-content .admonition-title,.rst-content .wy-alert.wy-alert-info .admonition-title,.rst-content .note .admonition-title,.rst-content .wy-alert-info.attention .admonition-title,.rst-content .wy-alert-info.caution .admonition-title,.rst-content .wy-alert-info.danger .admonition-title,.rst-content .wy-alert-info.error .admonition-title,.rst-content .wy-alert-info.hint .admonition-title,.rst-content .wy-alert-info.important .admonition-title,.rst-content .wy-alert-info.tip .admonition-title,.rst-content .wy-alert-info.warning .admonition-title,.rst-content .seealso .admonition-title,.rst-content .wy-alert-info.admonition-todo .admonition-title,.rst-content .wy-alert-info.admonition 
.admonition-title{background:#6ab0de}.wy-alert.wy-alert-success,.rst-content .wy-alert-success.note,.rst-content .wy-alert-success.attention,.rst-content .wy-alert-success.caution,.rst-content .wy-alert-success.danger,.rst-content .wy-alert-success.error,.rst-content .hint,.rst-content .important,.rst-content .tip,.rst-content .wy-alert-success.warning,.rst-content .wy-alert-success.seealso,.rst-content .wy-alert-success.admonition-todo,.rst-content .wy-alert-success.admonition{background:#dbfaf4}.wy-alert.wy-alert-success .wy-alert-title,.rst-content .wy-alert-success.note .wy-alert-title,.rst-content .wy-alert-success.attention .wy-alert-title,.rst-content .wy-alert-success.caution .wy-alert-title,.rst-content .wy-alert-success.danger .wy-alert-title,.rst-content .wy-alert-success.error .wy-alert-title,.rst-content .hint .wy-alert-title,.rst-content .important .wy-alert-title,.rst-content .tip .wy-alert-title,.rst-content .wy-alert-success.warning .wy-alert-title,.rst-content .wy-alert-success.seealso .wy-alert-title,.rst-content .wy-alert-success.admonition-todo .wy-alert-title,.rst-content .wy-alert-success.admonition .wy-alert-title,.wy-alert.wy-alert-success .rst-content .admonition-title,.rst-content .wy-alert.wy-alert-success .admonition-title,.rst-content .wy-alert-success.note .admonition-title,.rst-content .wy-alert-success.attention .admonition-title,.rst-content .wy-alert-success.caution .admonition-title,.rst-content .wy-alert-success.danger .admonition-title,.rst-content .wy-alert-success.error .admonition-title,.rst-content .hint .admonition-title,.rst-content .important .admonition-title,.rst-content .tip .admonition-title,.rst-content .wy-alert-success.warning .admonition-title,.rst-content .wy-alert-success.seealso .admonition-title,.rst-content .wy-alert-success.admonition-todo .admonition-title,.rst-content .wy-alert-success.admonition .admonition-title{background:#1abc9c}.wy-alert.wy-alert-neutral,.rst-content 
.wy-alert-neutral.note,.rst-content .wy-alert-neutral.attention,.rst-content .wy-alert-neutral.caution,.rst-content .wy-alert-neutral.danger,.rst-content .wy-alert-neutral.error,.rst-content .wy-alert-neutral.hint,.rst-content .wy-alert-neutral.important,.rst-content .wy-alert-neutral.tip,.rst-content .wy-alert-neutral.warning,.rst-content .wy-alert-neutral.seealso,.rst-content .wy-alert-neutral.admonition-todo,.rst-content .wy-alert-neutral.admonition{background:#f3f6f6}.wy-alert.wy-alert-neutral .wy-alert-title,.rst-content .wy-alert-neutral.note .wy-alert-title,.rst-content .wy-alert-neutral.attention .wy-alert-title,.rst-content .wy-alert-neutral.caution .wy-alert-title,.rst-content .wy-alert-neutral.danger .wy-alert-title,.rst-content .wy-alert-neutral.error .wy-alert-title,.rst-content .wy-alert-neutral.hint .wy-alert-title,.rst-content .wy-alert-neutral.important .wy-alert-title,.rst-content .wy-alert-neutral.tip .wy-alert-title,.rst-content .wy-alert-neutral.warning .wy-alert-title,.rst-content .wy-alert-neutral.seealso .wy-alert-title,.rst-content .wy-alert-neutral.admonition-todo .wy-alert-title,.rst-content .wy-alert-neutral.admonition .wy-alert-title,.wy-alert.wy-alert-neutral .rst-content .admonition-title,.rst-content .wy-alert.wy-alert-neutral .admonition-title,.rst-content .wy-alert-neutral.note .admonition-title,.rst-content .wy-alert-neutral.attention .admonition-title,.rst-content .wy-alert-neutral.caution .admonition-title,.rst-content .wy-alert-neutral.danger .admonition-title,.rst-content .wy-alert-neutral.error .admonition-title,.rst-content .wy-alert-neutral.hint .admonition-title,.rst-content .wy-alert-neutral.important .admonition-title,.rst-content .wy-alert-neutral.tip .admonition-title,.rst-content .wy-alert-neutral.warning .admonition-title,.rst-content .wy-alert-neutral.seealso .admonition-title,.rst-content .wy-alert-neutral.admonition-todo .admonition-title,.rst-content .wy-alert-neutral.admonition 
.admonition-title{color:#404040;background:#e1e4e5}.wy-alert.wy-alert-neutral a,.rst-content .wy-alert-neutral.note a,.rst-content .wy-alert-neutral.attention a,.rst-content .wy-alert-neutral.caution a,.rst-content .wy-alert-neutral.danger a,.rst-content .wy-alert-neutral.error a,.rst-content .wy-alert-neutral.hint a,.rst-content .wy-alert-neutral.important a,.rst-content .wy-alert-neutral.tip a,.rst-content .wy-alert-neutral.warning a,.rst-content .wy-alert-neutral.seealso a,.rst-content .wy-alert-neutral.admonition-todo a,.rst-content .wy-alert-neutral.admonition a{color:#2980B9}.wy-alert p:last-child,.rst-content .note p:last-child,.rst-content .attention p:last-child,.rst-content .caution p:last-child,.rst-content .danger p:last-child,.rst-content .error p:last-child,.rst-content .hint p:last-child,.rst-content .important p:last-child,.rst-content .tip p:last-child,.rst-content .warning p:last-child,.rst-content .seealso p:last-child,.rst-content .admonition-todo p:last-child,.rst-content .admonition p:last-child{margin-bottom:0}.wy-tray-container{position:fixed;bottom:0px;left:0;z-index:600}.wy-tray-container li{display:block;width:300px;background:transparent;color:#fff;text-align:center;box-shadow:0 5px 5px 0 rgba(0,0,0,0.1);padding:0 24px;min-width:20%;opacity:0;height:0;line-height:56px;overflow:hidden;-webkit-transition:all .3s ease-in;-moz-transition:all .3s ease-in;transition:all .3s ease-in}.wy-tray-container li.wy-tray-item-success{background:#27AE60}.wy-tray-container li.wy-tray-item-info{background:#2980B9}.wy-tray-container li.wy-tray-item-warning{background:#E67E22}.wy-tray-container li.wy-tray-item-danger{background:#E74C3C}.wy-tray-container li.on{opacity:1;height:56px}@media screen and (max-width: 768px){.wy-tray-container{bottom:auto;top:0;width:100%}.wy-tray-container 
li{width:100%}}button{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle;cursor:pointer;line-height:normal;-webkit-appearance:button;*overflow:visible}button::-moz-focus-inner,input::-moz-focus-inner{border:0;padding:0}button[disabled]{cursor:default}.btn{display:inline-block;border-radius:2px;line-height:normal;white-space:nowrap;text-align:center;cursor:pointer;font-size:100%;padding:6px 12px 8px 12px;color:#fff;border:1px solid rgba(0,0,0,0.1);background-color:#27AE60;text-decoration:none;font-weight:normal;font-family:"Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;box-shadow:0px 1px 2px -1px rgba(255,255,255,0.5) inset,0px -2px 0px 0px rgba(0,0,0,0.1) inset;outline-none:false;vertical-align:middle;*display:inline;zoom:1;-webkit-user-drag:none;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;-webkit-transition:all .1s linear;-moz-transition:all .1s linear;transition:all .1s linear}.btn-hover{background:#2e8ece;color:#fff}.btn:hover{background:#2cc36b;color:#fff}.btn:focus{background:#2cc36b;outline:0}.btn:active{box-shadow:0px -1px 0px 0px rgba(0,0,0,0.05) inset,0px 2px 0px 0px rgba(0,0,0,0.1) inset;padding:8px 12px 6px 12px}.btn:visited{color:#fff}.btn:disabled{background-image:none;filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);filter:alpha(opacity=40);opacity:.4;cursor:not-allowed;box-shadow:none}.btn-disabled{background-image:none;filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);filter:alpha(opacity=40);opacity:.4;cursor:not-allowed;box-shadow:none}.btn-disabled:hover,.btn-disabled:focus,.btn-disabled:active{background-image:none;filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);filter:alpha(opacity=40);opacity:.4;cursor:not-allowed;box-shadow:none}.btn::-moz-focus-inner{padding:0;border:0}.btn-small{font-size:80%}.btn-info{background-color:#2980B9 !important}.btn-info:hover{background-color:#2e8ece 
!important}.btn-neutral{background-color:#f3f6f6 !important;color:#404040 !important}.btn-neutral:hover{background-color:#e5ebeb !important;color:#404040}.btn-neutral:visited{color:#404040 !important}.btn-success{background-color:#27AE60 !important}.btn-success:hover{background-color:#295 !important}.btn-danger{background-color:#E74C3C !important}.btn-danger:hover{background-color:#ea6153 !important}.btn-warning{background-color:#E67E22 !important}.btn-warning:hover{background-color:#e98b39 !important}.btn-invert{background-color:#222}.btn-invert:hover{background-color:#2f2f2f !important}.btn-link{background-color:transparent !important;color:#2980B9;box-shadow:none;border-color:transparent !important}.btn-link:hover{background-color:transparent !important;color:#409ad5 !important;box-shadow:none}.btn-link:active{background-color:transparent !important;color:#409ad5 !important;box-shadow:none}.btn-link:visited{color:#9B59B6}.wy-btn-group .btn,.wy-control .btn{vertical-align:middle}.wy-btn-group{margin-bottom:24px;*zoom:1}.wy-btn-group:before,.wy-btn-group:after{display:table;content:""}.wy-btn-group:after{clear:both}.wy-dropdown{position:relative;display:inline-block}.wy-dropdown-active .wy-dropdown-menu{display:block}.wy-dropdown-menu{position:absolute;left:0;display:none;float:left;top:100%;min-width:100%;background:#fcfcfc;z-index:100;border:solid 1px #cfd7dd;box-shadow:0 2px 2px 0 rgba(0,0,0,0.1);padding:12px}.wy-dropdown-menu>dd>a{display:block;clear:both;color:#404040;white-space:nowrap;font-size:90%;padding:0 12px;cursor:pointer}.wy-dropdown-menu>dd>a:hover{background:#2980B9;color:#fff}.wy-dropdown-menu>dd.divider{border-top:solid 1px #cfd7dd;margin:6px 0}.wy-dropdown-menu>dd.search{padding-bottom:12px}.wy-dropdown-menu>dd.search 
input[type="search"]{width:100%}.wy-dropdown-menu>dd.call-to-action{background:#e3e3e3;text-transform:uppercase;font-weight:500;font-size:80%}.wy-dropdown-menu>dd.call-to-action:hover{background:#e3e3e3}.wy-dropdown-menu>dd.call-to-action .btn{color:#fff}.wy-dropdown.wy-dropdown-up .wy-dropdown-menu{bottom:100%;top:auto;left:auto;right:0}.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu{background:#fcfcfc;margin-top:2px}.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu a{padding:6px 12px}.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu a:hover{background:#2980B9;color:#fff}.wy-dropdown.wy-dropdown-left .wy-dropdown-menu{right:0;left:auto;text-align:right}.wy-dropdown-arrow:before{content:" ";border-bottom:5px solid #f5f5f5;border-left:5px solid transparent;border-right:5px solid transparent;position:absolute;display:block;top:-4px;left:50%;margin-left:-3px}.wy-dropdown-arrow.wy-dropdown-arrow-left:before{left:11px}.wy-form-stacked select{display:block}.wy-form-aligned input,.wy-form-aligned textarea,.wy-form-aligned select,.wy-form-aligned .wy-help-inline,.wy-form-aligned label{display:inline-block;*display:inline;*zoom:1;vertical-align:middle}.wy-form-aligned .wy-control-group>label{display:inline-block;vertical-align:middle;width:10em;margin:6px 12px 0 0;float:left}.wy-form-aligned .wy-control{float:left}.wy-form-aligned .wy-control label{display:block}.wy-form-aligned .wy-control select{margin-top:6px}fieldset{border:0;margin:0;padding:0}legend{display:block;width:100%;border:0;padding:0;white-space:normal;margin-bottom:24px;font-size:150%;*margin-left:-7px}label{display:block;margin:0 0 .3125em 
0;color:#333;font-size:90%}input,select,textarea{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle}.wy-control-group{margin-bottom:24px;*zoom:1;max-width:68em;margin-left:auto;margin-right:auto;*zoom:1}.wy-control-group:before,.wy-control-group:after{display:table;content:""}.wy-control-group:after{clear:both}.wy-control-group:before,.wy-control-group:after{display:table;content:""}.wy-control-group:after{clear:both}.wy-control-group.wy-control-group-required>label:after{content:" *";color:#E74C3C}.wy-control-group .wy-form-full,.wy-control-group .wy-form-halves,.wy-control-group .wy-form-thirds{padding-bottom:12px}.wy-control-group .wy-form-full select,.wy-control-group .wy-form-halves select,.wy-control-group .wy-form-thirds select{width:100%}.wy-control-group .wy-form-full input[type="text"],.wy-control-group .wy-form-full input[type="password"],.wy-control-group .wy-form-full input[type="email"],.wy-control-group .wy-form-full input[type="url"],.wy-control-group .wy-form-full input[type="date"],.wy-control-group .wy-form-full input[type="month"],.wy-control-group .wy-form-full input[type="time"],.wy-control-group .wy-form-full input[type="datetime"],.wy-control-group .wy-form-full input[type="datetime-local"],.wy-control-group .wy-form-full input[type="week"],.wy-control-group .wy-form-full input[type="number"],.wy-control-group .wy-form-full input[type="search"],.wy-control-group .wy-form-full input[type="tel"],.wy-control-group .wy-form-full input[type="color"],.wy-control-group .wy-form-halves input[type="text"],.wy-control-group .wy-form-halves input[type="password"],.wy-control-group .wy-form-halves input[type="email"],.wy-control-group .wy-form-halves input[type="url"],.wy-control-group .wy-form-halves input[type="date"],.wy-control-group .wy-form-halves input[type="month"],.wy-control-group .wy-form-halves input[type="time"],.wy-control-group .wy-form-halves input[type="datetime"],.wy-control-group .wy-form-halves 
input[type="datetime-local"],.wy-control-group .wy-form-halves input[type="week"],.wy-control-group .wy-form-halves input[type="number"],.wy-control-group .wy-form-halves input[type="search"],.wy-control-group .wy-form-halves input[type="tel"],.wy-control-group .wy-form-halves input[type="color"],.wy-control-group .wy-form-thirds input[type="text"],.wy-control-group .wy-form-thirds input[type="password"],.wy-control-group .wy-form-thirds input[type="email"],.wy-control-group .wy-form-thirds input[type="url"],.wy-control-group .wy-form-thirds input[type="date"],.wy-control-group .wy-form-thirds input[type="month"],.wy-control-group .wy-form-thirds input[type="time"],.wy-control-group .wy-form-thirds input[type="datetime"],.wy-control-group .wy-form-thirds input[type="datetime-local"],.wy-control-group .wy-form-thirds input[type="week"],.wy-control-group .wy-form-thirds input[type="number"],.wy-control-group .wy-form-thirds input[type="search"],.wy-control-group .wy-form-thirds input[type="tel"],.wy-control-group .wy-form-thirds input[type="color"]{width:100%}.wy-control-group .wy-form-full{float:left;display:block;margin-right:2.3576515979%;width:100%;margin-right:0}.wy-control-group .wy-form-full:last-child{margin-right:0}.wy-control-group .wy-form-halves{float:left;display:block;margin-right:2.3576515979%;width:48.821174201%}.wy-control-group .wy-form-halves:last-child{margin-right:0}.wy-control-group .wy-form-halves:nth-of-type(2n){margin-right:0}.wy-control-group .wy-form-halves:nth-of-type(2n+1){clear:left}.wy-control-group .wy-form-thirds{float:left;display:block;margin-right:2.3576515979%;width:31.7615656014%}.wy-control-group .wy-form-thirds:last-child{margin-right:0}.wy-control-group .wy-form-thirds:nth-of-type(3n){margin-right:0}.wy-control-group .wy-form-thirds:nth-of-type(3n+1){clear:left}.wy-control-group.wy-control-group-no-input .wy-control{margin:6px 0 0 0;font-size:90%}.wy-control-no-input{display:inline-block;margin:6px 0 0 
0;font-size:90%}.wy-control-group.fluid-input input[type="text"],.wy-control-group.fluid-input input[type="password"],.wy-control-group.fluid-input input[type="email"],.wy-control-group.fluid-input input[type="url"],.wy-control-group.fluid-input input[type="date"],.wy-control-group.fluid-input input[type="month"],.wy-control-group.fluid-input input[type="time"],.wy-control-group.fluid-input input[type="datetime"],.wy-control-group.fluid-input input[type="datetime-local"],.wy-control-group.fluid-input input[type="week"],.wy-control-group.fluid-input input[type="number"],.wy-control-group.fluid-input input[type="search"],.wy-control-group.fluid-input input[type="tel"],.wy-control-group.fluid-input input[type="color"]{width:100%}.wy-form-message-inline{display:inline-block;padding-left:.3em;color:#666;vertical-align:middle;font-size:90%}.wy-form-message{display:block;color:#999;font-size:70%;margin-top:.3125em;font-style:italic}.wy-form-message p{font-size:inherit;font-style:italic;margin-bottom:6px}.wy-form-message p:last-child{margin-bottom:0}input{line-height:normal}input[type="button"],input[type="reset"],input[type="submit"]{-webkit-appearance:button;cursor:pointer;font-family:"Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;*overflow:visible}input[type="text"],input[type="password"],input[type="email"],input[type="url"],input[type="date"],input[type="month"],input[type="time"],input[type="datetime"],input[type="datetime-local"],input[type="week"],input[type="number"],input[type="search"],input[type="tel"],input[type="color"]{-webkit-appearance:none;padding:6px;display:inline-block;border:1px solid #ccc;font-size:80%;font-family:"Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;box-shadow:inset 0 1px 3px #ddd;border-radius:0;-webkit-transition:border .3s linear;-moz-transition:border .3s linear;transition:border .3s linear}input[type="datetime-local"]{padding:.34375em 
.625em}input[disabled]{cursor:default}input[type="checkbox"],input[type="radio"]{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box;padding:0;margin-right:.3125em;*height:13px;*width:13px}input[type="search"]{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}input[type="search"]::-webkit-search-cancel-button,input[type="search"]::-webkit-search-decoration{-webkit-appearance:none}input[type="text"]:focus,input[type="password"]:focus,input[type="email"]:focus,input[type="url"]:focus,input[type="date"]:focus,input[type="month"]:focus,input[type="time"]:focus,input[type="datetime"]:focus,input[type="datetime-local"]:focus,input[type="week"]:focus,input[type="number"]:focus,input[type="search"]:focus,input[type="tel"]:focus,input[type="color"]:focus{outline:0;outline:thin dotted \9;border-color:#333}input.no-focus:focus{border-color:#ccc !important}input[type="file"]:focus,input[type="radio"]:focus,input[type="checkbox"]:focus{outline:thin dotted #333;outline:1px auto #129FEA}input[type="text"][disabled],input[type="password"][disabled],input[type="email"][disabled],input[type="url"][disabled],input[type="date"][disabled],input[type="month"][disabled],input[type="time"][disabled],input[type="datetime"][disabled],input[type="datetime-local"][disabled],input[type="week"][disabled],input[type="number"][disabled],input[type="search"][disabled],input[type="tel"][disabled],input[type="color"][disabled]{cursor:not-allowed;background-color:#fafafa}input:focus:invalid,textarea:focus:invalid,select:focus:invalid{color:#E74C3C;border:1px solid 
#E74C3C}input:focus:invalid:focus,textarea:focus:invalid:focus,select:focus:invalid:focus{border-color:#E74C3C}input[type="file"]:focus:invalid:focus,input[type="radio"]:focus:invalid:focus,input[type="checkbox"]:focus:invalid:focus{outline-color:#E74C3C}input.wy-input-large{padding:12px;font-size:100%}textarea{overflow:auto;vertical-align:top;width:100%;font-family:"Lato","proxima-nova","Helvetica Neue",Arial,sans-serif}select,textarea{padding:.5em .625em;display:inline-block;border:1px solid #ccc;font-size:80%;box-shadow:inset 0 1px 3px #ddd;-webkit-transition:border .3s linear;-moz-transition:border .3s linear;transition:border .3s linear}select{border:1px solid #ccc;background-color:#fff}select[multiple]{height:auto}select:focus,textarea:focus{outline:0}select[disabled],textarea[disabled],input[readonly],select[readonly],textarea[readonly]{cursor:not-allowed;background-color:#fafafa}input[type="radio"][disabled],input[type="checkbox"][disabled]{cursor:not-allowed}.wy-checkbox,.wy-radio{margin:6px 0;color:#404040;display:block}.wy-checkbox input,.wy-radio input{vertical-align:baseline}.wy-form-message-inline{display:inline-block;*display:inline;*zoom:1;vertical-align:middle}.wy-input-prefix,.wy-input-suffix{white-space:nowrap;padding:6px}.wy-input-prefix .wy-input-context,.wy-input-suffix .wy-input-context{line-height:27px;padding:0 8px;display:inline-block;font-size:80%;background-color:#f3f6f6;border:solid 1px #ccc;color:#999}.wy-input-suffix .wy-input-context{border-left:0}.wy-input-prefix .wy-input-context{border-right:0}.wy-switch{position:relative;display:block;height:24px;margin-top:12px;cursor:pointer}.wy-switch:before{position:absolute;content:"";display:block;left:0;top:0;width:36px;height:12px;border-radius:4px;background:#ccc;-webkit-transition:all .2s ease-in-out;-moz-transition:all .2s ease-in-out;transition:all .2s 
ease-in-out}.wy-switch:after{position:absolute;content:"";display:block;width:18px;height:18px;border-radius:4px;background:#999;left:-3px;top:-3px;-webkit-transition:all .2s ease-in-out;-moz-transition:all .2s ease-in-out;transition:all .2s ease-in-out}.wy-switch span{position:absolute;left:48px;display:block;font-size:12px;color:#ccc;line-height:1}.wy-switch.active:before{background:#1e8449}.wy-switch.active:after{left:24px;background:#27AE60}.wy-switch.disabled{cursor:not-allowed;opacity:.8}.wy-control-group.wy-control-group-error .wy-form-message,.wy-control-group.wy-control-group-error>label{color:#E74C3C}.wy-control-group.wy-control-group-error input[type="text"],.wy-control-group.wy-control-group-error input[type="password"],.wy-control-group.wy-control-group-error input[type="email"],.wy-control-group.wy-control-group-error input[type="url"],.wy-control-group.wy-control-group-error input[type="date"],.wy-control-group.wy-control-group-error input[type="month"],.wy-control-group.wy-control-group-error input[type="time"],.wy-control-group.wy-control-group-error input[type="datetime"],.wy-control-group.wy-control-group-error input[type="datetime-local"],.wy-control-group.wy-control-group-error input[type="week"],.wy-control-group.wy-control-group-error input[type="number"],.wy-control-group.wy-control-group-error input[type="search"],.wy-control-group.wy-control-group-error input[type="tel"],.wy-control-group.wy-control-group-error input[type="color"]{border:solid 1px #E74C3C}.wy-control-group.wy-control-group-error textarea{border:solid 1px #E74C3C}.wy-inline-validate{white-space:nowrap}.wy-inline-validate .wy-input-context{padding:.5em .625em;display:inline-block;font-size:80%}.wy-inline-validate.wy-inline-validate-success .wy-input-context{color:#27AE60}.wy-inline-validate.wy-inline-validate-danger .wy-input-context{color:#E74C3C}.wy-inline-validate.wy-inline-validate-warning .wy-input-context{color:#E67E22}.wy-inline-validate.wy-inline-validate-info 
.wy-input-context{color:#2980B9}.rotate-90{-webkit-transform:rotate(90deg);-moz-transform:rotate(90deg);-ms-transform:rotate(90deg);-o-transform:rotate(90deg);transform:rotate(90deg)}.rotate-180{-webkit-transform:rotate(180deg);-moz-transform:rotate(180deg);-ms-transform:rotate(180deg);-o-transform:rotate(180deg);transform:rotate(180deg)}.rotate-270{-webkit-transform:rotate(270deg);-moz-transform:rotate(270deg);-ms-transform:rotate(270deg);-o-transform:rotate(270deg);transform:rotate(270deg)}.mirror{-webkit-transform:scaleX(-1);-moz-transform:scaleX(-1);-ms-transform:scaleX(-1);-o-transform:scaleX(-1);transform:scaleX(-1)}.mirror.rotate-90{-webkit-transform:scaleX(-1) rotate(90deg);-moz-transform:scaleX(-1) rotate(90deg);-ms-transform:scaleX(-1) rotate(90deg);-o-transform:scaleX(-1) rotate(90deg);transform:scaleX(-1) rotate(90deg)}.mirror.rotate-180{-webkit-transform:scaleX(-1) rotate(180deg);-moz-transform:scaleX(-1) rotate(180deg);-ms-transform:scaleX(-1) rotate(180deg);-o-transform:scaleX(-1) rotate(180deg);transform:scaleX(-1) rotate(180deg)}.mirror.rotate-270{-webkit-transform:scaleX(-1) rotate(270deg);-moz-transform:scaleX(-1) rotate(270deg);-ms-transform:scaleX(-1) rotate(270deg);-o-transform:scaleX(-1) rotate(270deg);transform:scaleX(-1) rotate(270deg)}@media only screen and (max-width: 480px){.wy-form button[type="submit"]{margin:.7em 0 0}.wy-form input[type="text"],.wy-form input[type="password"],.wy-form input[type="email"],.wy-form input[type="url"],.wy-form input[type="date"],.wy-form input[type="month"],.wy-form input[type="time"],.wy-form input[type="datetime"],.wy-form input[type="datetime-local"],.wy-form input[type="week"],.wy-form input[type="number"],.wy-form input[type="search"],.wy-form input[type="tel"],.wy-form input[type="color"]{margin-bottom:.3em;display:block}.wy-form label{margin-bottom:.3em;display:block}.wy-form input[type="password"],.wy-form input[type="email"],.wy-form input[type="url"],.wy-form input[type="date"],.wy-form 
input[type="month"],.wy-form input[type="time"],.wy-form input[type="datetime"],.wy-form input[type="datetime-local"],.wy-form input[type="week"],.wy-form input[type="number"],.wy-form input[type="search"],.wy-form input[type="tel"],.wy-form input[type="color"]{margin-bottom:0}.wy-form-aligned .wy-control-group label{margin-bottom:.3em;text-align:left;display:block;width:100%}.wy-form-aligned .wy-control{margin:1.5em 0 0 0}.wy-form .wy-help-inline,.wy-form-message-inline,.wy-form-message{display:block;font-size:80%;padding:6px 0}}@media screen and (max-width: 768px){.tablet-hide{display:none}}@media screen and (max-width: 480px){.mobile-hide{display:none}}.float-left{float:left}.float-right{float:right}.full-width{width:100%}.wy-table,.rst-content table.docutils,.rst-content table.field-list{border-collapse:collapse;border-spacing:0;empty-cells:show;margin-bottom:24px}.wy-table caption,.rst-content table.docutils caption,.rst-content table.field-list caption{color:#000;font:italic 85%/1 arial,sans-serif;padding:1em 0;text-align:center}.wy-table td,.rst-content table.docutils td,.rst-content table.field-list td,.wy-table th,.rst-content table.docutils th,.rst-content table.field-list th{font-size:90%;margin:0;overflow:visible;padding:8px 16px}.wy-table td:first-child,.rst-content table.docutils td:first-child,.rst-content table.field-list td:first-child,.wy-table th:first-child,.rst-content table.docutils th:first-child,.rst-content table.field-list th:first-child{border-left-width:0}.wy-table thead,.rst-content table.docutils thead,.rst-content table.field-list thead{color:#000;text-align:left;vertical-align:bottom;white-space:nowrap}.wy-table thead th,.rst-content table.docutils thead th,.rst-content table.field-list thead th{font-weight:bold;border-bottom:solid 2px #e1e4e5}.wy-table td,.rst-content table.docutils td,.rst-content table.field-list td{background-color:transparent;vertical-align:middle}.wy-table td p,.rst-content table.docutils td p,.rst-content 
table.field-list td p{line-height:18px}.wy-table td p:last-child,.rst-content table.docutils td p:last-child,.rst-content table.field-list td p:last-child{margin-bottom:0}.wy-table .wy-table-cell-min,.rst-content table.docutils .wy-table-cell-min,.rst-content table.field-list .wy-table-cell-min{width:1%;padding-right:0}.wy-table .wy-table-cell-min input[type=checkbox],.rst-content table.docutils .wy-table-cell-min input[type=checkbox],.rst-content table.field-list .wy-table-cell-min input[type=checkbox],.wy-table .wy-table-cell-min input[type=checkbox],.rst-content table.docutils .wy-table-cell-min input[type=checkbox],.rst-content table.field-list .wy-table-cell-min input[type=checkbox]{margin:0}.wy-table-secondary{color:gray;font-size:90%}.wy-table-tertiary{color:gray;font-size:80%}.wy-table-odd td,.wy-table-striped tr:nth-child(2n-1) td,.rst-content table.docutils:not(.field-list) tr:nth-child(2n-1) td{background-color:#f3f6f6}.wy-table-backed{background-color:#f3f6f6}.wy-table-bordered-all,.rst-content table.docutils{border:1px solid #e1e4e5}.wy-table-bordered-all td,.rst-content table.docutils td{border-bottom:1px solid #e1e4e5;border-left:1px solid #e1e4e5}.wy-table-bordered-all tbody>tr:last-child td,.rst-content table.docutils tbody>tr:last-child td{border-bottom-width:0}.wy-table-bordered{border:1px solid #e1e4e5}.wy-table-bordered-rows td{border-bottom:1px solid #e1e4e5}.wy-table-bordered-rows tbody>tr:last-child td{border-bottom-width:0}.wy-table-horizontal tbody>tr:last-child td{border-bottom-width:0}.wy-table-horizontal td,.wy-table-horizontal th{border-width:0 0 1px 0;border-bottom:1px solid #e1e4e5}.wy-table-horizontal tbody>tr:last-child td{border-bottom-width:0}.wy-table-responsive{margin-bottom:24px;max-width:100%;overflow:auto}.wy-table-responsive table{margin-bottom:0 !important}.wy-table-responsive table td,.wy-table-responsive table 
th{white-space:nowrap}a{color:#2980B9;text-decoration:none;cursor:pointer}a:hover{color:#3091d1}a:visited{color:#9B59B6}html{height:100%;overflow-x:hidden}body{font-family:"Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;font-weight:normal;color:#404040;min-height:100%;overflow-x:hidden;background:#edf0f2}.wy-text-left{text-align:left}.wy-text-center{text-align:center}.wy-text-right{text-align:right}.wy-text-large{font-size:120%}.wy-text-normal{font-size:100%}.wy-text-small,small{font-size:80%}.wy-text-strike{text-decoration:line-through}.wy-text-warning{color:#E67E22 !important}a.wy-text-warning:hover{color:#eb9950 !important}.wy-text-info{color:#2980B9 !important}a.wy-text-info:hover{color:#409ad5 !important}.wy-text-success{color:#27AE60 !important}a.wy-text-success:hover{color:#36d278 !important}.wy-text-danger{color:#E74C3C !important}a.wy-text-danger:hover{color:#ed7669 !important}.wy-text-neutral{color:#404040 !important}a.wy-text-neutral:hover{color:#595959 !important}h1,h2,.rst-content .toctree-wrapper p.caption,h3,h4,h5,h6,legend{margin-top:0;font-weight:700;font-family:"Roboto Slab","ff-tisa-web-pro","Georgia",Arial,sans-serif}p{line-height:24px;margin:0;font-size:16px;margin-bottom:24px}h1{font-size:175%}h2,.rst-content .toctree-wrapper p.caption{font-size:150%}h3{font-size:125%}h4{font-size:115%}h5{font-size:110%}h6{font-size:100%}hr{display:block;height:1px;border:0;border-top:1px solid #e1e4e5;margin:24px 0;padding:0}code,.rst-content tt,.rst-content code{white-space:nowrap;max-width:100%;background:#fff;border:solid 1px #e1e4e5;font-size:75%;padding:0 5px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",Courier,monospace;color:#E74C3C;overflow-x:auto}code.code-large,.rst-content tt.code-large{font-size:90%}.wy-plain-list-disc,.rst-content .section ul,.rst-content .toctree-wrapper ul,article ul{list-style:disc;line-height:24px;margin-bottom:24px}.wy-plain-list-disc li,.rst-content .section ul li,.rst-content 
.toctree-wrapper ul li,article ul li{list-style:disc;margin-left:24px}.wy-plain-list-disc li p:last-child,.rst-content .section ul li p:last-child,.rst-content .toctree-wrapper ul li p:last-child,article ul li p:last-child{margin-bottom:0}.wy-plain-list-disc li ul,.rst-content .section ul li ul,.rst-content .toctree-wrapper ul li ul,article ul li ul{margin-bottom:0}.wy-plain-list-disc li li,.rst-content .section ul li li,.rst-content .toctree-wrapper ul li li,article ul li li{list-style:circle}.wy-plain-list-disc li li li,.rst-content .section ul li li li,.rst-content .toctree-wrapper ul li li li,article ul li li li{list-style:square}.wy-plain-list-disc li ol li,.rst-content .section ul li ol li,.rst-content .toctree-wrapper ul li ol li,article ul li ol li{list-style:decimal}.wy-plain-list-decimal,.rst-content .section ol,.rst-content ol.arabic,article ol{list-style:decimal;line-height:24px;margin-bottom:24px}.wy-plain-list-decimal li,.rst-content .section ol li,.rst-content ol.arabic li,article ol li{list-style:decimal;margin-left:24px}.wy-plain-list-decimal li p:last-child,.rst-content .section ol li p:last-child,.rst-content ol.arabic li p:last-child,article ol li p:last-child{margin-bottom:0}.wy-plain-list-decimal li ul,.rst-content .section ol li ul,.rst-content ol.arabic li ul,article ol li ul{margin-bottom:0}.wy-plain-list-decimal li ul li,.rst-content .section ol li ul li,.rst-content ol.arabic li ul li,article ol li ul li{list-style:disc}.wy-breadcrumbs{*zoom:1}.wy-breadcrumbs:before,.wy-breadcrumbs:after{display:table;content:""}.wy-breadcrumbs:after{clear:both}.wy-breadcrumbs li{display:inline-block}.wy-breadcrumbs li.wy-breadcrumbs-aside{float:right}.wy-breadcrumbs li a{display:inline-block;padding:5px}.wy-breadcrumbs li a:first-child{padding-left:0}.wy-breadcrumbs li code,.wy-breadcrumbs li .rst-content tt,.rst-content .wy-breadcrumbs li tt{padding:5px;border:none;background:none}.wy-breadcrumbs li code.literal,.wy-breadcrumbs li .rst-content 
tt.literal,.rst-content .wy-breadcrumbs li tt.literal{color:#404040}.wy-breadcrumbs-extra{margin-bottom:0;color:#b3b3b3;font-size:80%;display:inline-block}@media screen and (max-width: 480px){.wy-breadcrumbs-extra{display:none}.wy-breadcrumbs li.wy-breadcrumbs-aside{display:none}}@media print{.wy-breadcrumbs li.wy-breadcrumbs-aside{display:none}}.wy-affix{position:fixed;top:1.618em}.wy-menu a:hover{text-decoration:none}.wy-menu-horiz{*zoom:1}.wy-menu-horiz:before,.wy-menu-horiz:after{display:table;content:""}.wy-menu-horiz:after{clear:both}.wy-menu-horiz ul,.wy-menu-horiz li{display:inline-block}.wy-menu-horiz li:hover{background:rgba(255,255,255,0.1)}.wy-menu-horiz li.divide-left{border-left:solid 1px #404040}.wy-menu-horiz li.divide-right{border-right:solid 1px #404040}.wy-menu-horiz a{height:32px;display:inline-block;line-height:32px;padding:0 16px}.wy-menu-vertical{width:300px}.wy-menu-vertical header,.wy-menu-vertical p.caption{height:32px;display:inline-block;line-height:32px;padding:0 1.618em;margin-bottom:0;display:block;font-weight:bold;text-transform:uppercase;font-size:80%;white-space:nowrap}.wy-menu-vertical ul{margin-bottom:0}.wy-menu-vertical li.divide-top{border-top:solid 1px #404040}.wy-menu-vertical li.divide-bottom{border-bottom:solid 1px #404040}.wy-menu-vertical li.current{background:#e3e3e3}.wy-menu-vertical li.current a{color:gray;border-right:solid 1px #c9c9c9;padding:.4045em 2.427em}.wy-menu-vertical li.current a:hover{background:#d6d6d6}.wy-menu-vertical li code,.wy-menu-vertical li .rst-content tt,.rst-content .wy-menu-vertical li tt{border:none;background:inherit;color:inherit;padding-left:0;padding-right:0}.wy-menu-vertical li span.toctree-expand{display:block;float:left;margin-left:-1.2em;font-size:.8em;line-height:1.6em;color:#4d4d4d}.wy-menu-vertical li.on a,.wy-menu-vertical li.current>a{color:#404040;padding:.4045em 1.618em;font-weight:bold;position:relative;background:#fcfcfc;border:none;padding-left:1.618em -4px}.wy-menu-vertical 
li.on a:hover,.wy-menu-vertical li.current>a:hover{background:#fcfcfc}.wy-menu-vertical li.on a:hover span.toctree-expand,.wy-menu-vertical li.current>a:hover span.toctree-expand{color:gray}.wy-menu-vertical li.on a span.toctree-expand,.wy-menu-vertical li.current>a span.toctree-expand{display:block;font-size:.8em;line-height:1.6em;color:#333}.wy-menu-vertical li.toctree-l1.current>a{border-bottom:solid 1px #c9c9c9;border-top:solid 1px #c9c9c9}.wy-menu-vertical li.toctree-l2 a,.wy-menu-vertical li.toctree-l3 a,.wy-menu-vertical li.toctree-l4 a{color:#404040}.wy-menu-vertical li.toctree-l1.current li.toctree-l2>ul,.wy-menu-vertical li.toctree-l2.current li.toctree-l3>ul{display:none}.wy-menu-vertical li.toctree-l1.current li.toctree-l2.current>ul,.wy-menu-vertical li.toctree-l2.current li.toctree-l3.current>ul{display:block}.wy-menu-vertical li.toctree-l2.current>a{background:#c9c9c9;padding:.4045em 2.427em}.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a{display:block;background:#c9c9c9;padding:.4045em 4.045em}.wy-menu-vertical li.toctree-l2 a:hover span.toctree-expand{color:gray}.wy-menu-vertical li.toctree-l2 span.toctree-expand{color:#a3a3a3}.wy-menu-vertical li.toctree-l3{font-size:.9em}.wy-menu-vertical li.toctree-l3.current>a{background:#bdbdbd;padding:.4045em 4.045em}.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a{display:block;background:#bdbdbd;padding:.4045em 5.663em}.wy-menu-vertical li.toctree-l3 a:hover span.toctree-expand{color:gray}.wy-menu-vertical li.toctree-l3 span.toctree-expand{color:#969696}.wy-menu-vertical li.toctree-l4{font-size:.9em}.wy-menu-vertical li.current ul{display:block}.wy-menu-vertical li ul{margin-bottom:0;display:none}.wy-menu-vertical li ul li a{margin-bottom:0;color:#d9d9d9;font-weight:normal}.wy-menu-vertical a{display:inline-block;line-height:18px;padding:.4045em 1.618em;display:block;position:relative;font-size:90%;color:#d9d9d9}.wy-menu-vertical 
a:hover{background-color:#4e4a4a;cursor:pointer}.wy-menu-vertical a:hover span.toctree-expand{color:#d9d9d9}.wy-menu-vertical a:active{background-color:#2980B9;cursor:pointer;color:#fff}.wy-menu-vertical a:active span.toctree-expand{color:#fff}.wy-side-nav-search{display:block;width:300px;padding:.809em;margin-bottom:.809em;z-index:200;background-color:#2980B9;text-align:center;padding:.809em;display:block;color:#fcfcfc;margin-bottom:.809em}.wy-side-nav-search input[type=text]{width:100%;border-radius:50px;padding:6px 12px;border-color:#2472a4}.wy-side-nav-search img{display:block;margin:auto auto .809em auto;height:45px;width:45px;background-color:#2980B9;padding:5px;border-radius:100%}.wy-side-nav-search>a,.wy-side-nav-search .wy-dropdown>a{color:#fcfcfc;font-size:100%;font-weight:bold;display:inline-block;padding:4px 6px;margin-bottom:.809em}.wy-side-nav-search>a:hover,.wy-side-nav-search .wy-dropdown>a:hover{background:rgba(255,255,255,0.1)}.wy-side-nav-search>a img.logo,.wy-side-nav-search .wy-dropdown>a img.logo{display:block;margin:0 auto;height:auto;width:auto;border-radius:0;max-width:100%;background:transparent}.wy-side-nav-search>a.icon img.logo,.wy-side-nav-search .wy-dropdown>a.icon img.logo{margin-top:.85em}.wy-side-nav-search>div.version{margin-top:-.4045em;margin-bottom:.809em;font-weight:normal;color:rgba(255,255,255,0.3)}.wy-nav .wy-menu-vertical header{color:#2980B9}.wy-nav .wy-menu-vertical a{color:#b3b3b3}.wy-nav .wy-menu-vertical a:hover{background-color:#2980B9;color:#fff}[data-menu-wrap]{-webkit-transition:all .2s ease-in;-moz-transition:all .2s ease-in;transition:all .2s 
ease-in;position:absolute;opacity:1;width:100%;opacity:0}[data-menu-wrap].move-center{left:0;right:auto;opacity:1}[data-menu-wrap].move-left{right:auto;left:-100%;opacity:0}[data-menu-wrap].move-right{right:-100%;left:auto;opacity:0}.wy-body-for-nav{background:#fcfcfc}.wy-grid-for-nav{position:absolute;width:100%;height:100%}.wy-nav-side{position:fixed;top:0;bottom:0;left:0;padding-bottom:2em;width:300px;overflow-x:hidden;overflow-y:hidden;min-height:100%;color:#9b9b9b;background:#343131;z-index:200}.wy-side-scroll{width:320px;position:relative;overflow-x:hidden;overflow-y:scroll;height:100%}.wy-nav-top{display:none;background:#2980B9;color:#fff;padding:.4045em .809em;position:relative;line-height:50px;text-align:center;font-size:100%;*zoom:1}.wy-nav-top:before,.wy-nav-top:after{display:table;content:""}.wy-nav-top:after{clear:both}.wy-nav-top a{color:#fff;font-weight:bold}.wy-nav-top img{margin-right:12px;height:45px;width:45px;background-color:#2980B9;padding:5px;border-radius:100%}.wy-nav-top i{font-size:30px;float:left;cursor:pointer;padding-top:inherit}.wy-nav-content-wrap{margin-left:300px;background:#fcfcfc;min-height:100%}.wy-nav-content{padding:1.618em 3.236em;height:100%;max-width:800px;margin:auto}.wy-body-mask{position:fixed;width:100%;height:100%;background:rgba(0,0,0,0.2);display:none;z-index:499}.wy-body-mask.on{display:block}footer{color:gray}footer p{margin-bottom:12px}footer span.commit code,footer span.commit .rst-content tt,.rst-content footer span.commit tt{padding:0px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier 
New",Courier,monospace;font-size:1em;background:none;border:none;color:gray}.rst-footer-buttons{*zoom:1}.rst-footer-buttons:before,.rst-footer-buttons:after{width:100%}.rst-footer-buttons:before,.rst-footer-buttons:after{display:table;content:""}.rst-footer-buttons:after{clear:both}.rst-breadcrumbs-buttons{margin-top:12px;*zoom:1}.rst-breadcrumbs-buttons:before,.rst-breadcrumbs-buttons:after{display:table;content:""}.rst-breadcrumbs-buttons:after{clear:both}#search-results .search li{margin-bottom:24px;border-bottom:solid 1px #e1e4e5;padding-bottom:24px}#search-results .search li:first-child{border-top:solid 1px #e1e4e5;padding-top:24px}#search-results .search li a{font-size:120%;margin-bottom:12px;display:inline-block}#search-results .context{color:gray;font-size:90%}@media screen and (max-width: 768px){.wy-body-for-nav{background:#fcfcfc}.wy-nav-top{display:block}.wy-nav-side{left:-300px}.wy-nav-side.shift{width:85%;left:0}.wy-side-scroll{width:auto}.wy-side-nav-search{width:auto}.wy-menu.wy-menu-vertical{width:auto}.wy-nav-content-wrap{margin-left:0}.wy-nav-content-wrap .wy-nav-content{padding:1.618em}.wy-nav-content-wrap.shift{position:fixed;min-width:100%;left:85%;top:0;height:100%;overflow:hidden}}@media screen and (min-width: 1100px){.wy-nav-content-wrap{background:rgba(0,0,0,0.05)}.wy-nav-content{margin:0;background:#fcfcfc}}@media print{.rst-versions,footer,.wy-nav-side{display:none}.wy-nav-content-wrap{margin-left:0}}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;font-family:"Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;z-index:400}.rst-versions a{color:#2980B9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27AE60;*zoom:1}.rst-versions .rst-current-version:before,.rst-versions .rst-current-version:after{display:table;content:""}.rst-versions 
.rst-current-version:after{clear:both}.rst-versions .rst-current-version .fa,.rst-versions .rst-current-version .wy-menu-vertical li span.toctree-expand,.wy-menu-vertical li .rst-versions .rst-current-version span.toctree-expand,.rst-versions .rst-current-version .rst-content .admonition-title,.rst-content .rst-versions .rst-current-version .admonition-title,.rst-versions .rst-current-version .rst-content h1 .headerlink,.rst-content h1 .rst-versions .rst-current-version .headerlink,.rst-versions .rst-current-version .rst-content h2 .headerlink,.rst-content h2 .rst-versions .rst-current-version .headerlink,.rst-versions .rst-current-version .rst-content h3 .headerlink,.rst-content h3 .rst-versions .rst-current-version .headerlink,.rst-versions .rst-current-version .rst-content h4 .headerlink,.rst-content h4 .rst-versions .rst-current-version .headerlink,.rst-versions .rst-current-version .rst-content h5 .headerlink,.rst-content h5 .rst-versions .rst-current-version .headerlink,.rst-versions .rst-current-version .rst-content h6 .headerlink,.rst-content h6 .rst-versions .rst-current-version .headerlink,.rst-versions .rst-current-version .rst-content dl dt .headerlink,.rst-content dl dt .rst-versions .rst-current-version .headerlink,.rst-versions .rst-current-version .rst-content p.caption .headerlink,.rst-content p.caption .rst-versions .rst-current-version .headerlink,.rst-versions .rst-current-version .rst-content table>caption .headerlink,.rst-content table>caption .rst-versions .rst-current-version .headerlink,.rst-versions .rst-current-version .rst-content tt.download span:first-child,.rst-content tt.download .rst-versions .rst-current-version span:first-child,.rst-versions .rst-current-version .rst-content code.download span:first-child,.rst-content code.download .rst-versions .rst-current-version span:first-child,.rst-versions .rst-current-version .icon{color:#fcfcfc}.rst-versions .rst-current-version .fa-book,.rst-versions .rst-current-version 
.icon-book{float:left}.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#E74C3C;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#F1C40F;color:#000}.rst-versions.shift-up{height:auto;max-height:100%}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:gray;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:solid 1px #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px}.rst-versions.rst-badge .icon-book{float:none}.rst-versions.rst-badge .fa-book,.rst-versions.rst-badge .icon-book{float:none}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version .fa-book,.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge .rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and (max-width: 768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}}.rst-content img{max-width:100%;height:auto}.rst-content div.figure{margin-bottom:24px}.rst-content div.figure p.caption{font-style:italic}.rst-content div.figure p:last-child.caption{margin-bottom:0px}.rst-content div.figure.align-center{text-align:center}.rst-content .section>img,.rst-content .section>a>img{margin-bottom:24px}.rst-content abbr[title]{text-decoration:none}.rst-content.style-external-links 
a.reference.external:after{font-family:FontAwesome;content:"";color:#b3b3b3;vertical-align:super;font-size:60%;margin:0 .2em}.rst-content blockquote{margin-left:24px;line-height:24px;margin-bottom:24px}.rst-content pre.literal-block{white-space:pre;margin:0;padding:12px 12px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",Courier,monospace;display:block;overflow:auto}.rst-content pre.literal-block,.rst-content div[class^='highlight']{border:1px solid #e1e4e5;overflow-x:auto;margin:1px 0 24px 0}.rst-content pre.literal-block div[class^='highlight'],.rst-content div[class^='highlight'] div[class^='highlight']{padding:0px;border:none;margin:0}.rst-content div[class^='highlight'] td.code{width:100%}.rst-content .linenodiv pre{border-right:solid 1px #e6e9ea;margin:0;padding:12px 12px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",Courier,monospace;user-select:none;pointer-events:none}.rst-content div[class^='highlight'] pre{white-space:pre;margin:0;padding:12px 12px;display:block;overflow:auto}.rst-content div[class^='highlight'] pre .hll{display:block;margin:0 -12px;padding:0 12px}.rst-content pre.literal-block,.rst-content div[class^='highlight'] pre,.rst-content .linenodiv pre{font-family:SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",Courier,monospace;font-size:12px;line-height:1.4}@media print{.rst-content .codeblock,.rst-content div[class^='highlight'],.rst-content div[class^='highlight'] pre{white-space:pre-wrap}}.rst-content .note .last,.rst-content .attention .last,.rst-content .caution .last,.rst-content .danger .last,.rst-content .error .last,.rst-content .hint .last,.rst-content .important .last,.rst-content .tip .last,.rst-content .warning .last,.rst-content .seealso .last,.rst-content .admonition-todo .last,.rst-content .admonition .last{margin-bottom:0}.rst-content .admonition-title:before{margin-right:4px}.rst-content .admonition 
table{border-color:rgba(0,0,0,0.1)}.rst-content .admonition table td,.rst-content .admonition table th{background:transparent !important;border-color:rgba(0,0,0,0.1) !important}.rst-content .section ol.loweralpha,.rst-content .section ol.loweralpha li{list-style:lower-alpha}.rst-content .section ol.upperalpha,.rst-content .section ol.upperalpha li{list-style:upper-alpha}.rst-content .section ol p,.rst-content .section ul p{margin-bottom:12px}.rst-content .section ol p:last-child,.rst-content .section ul p:last-child{margin-bottom:24px}.rst-content .line-block{margin-left:0px;margin-bottom:24px;line-height:24px}.rst-content .line-block .line-block{margin-left:24px;margin-bottom:0px}.rst-content .topic-title{font-weight:bold;margin-bottom:12px}.rst-content .toc-backref{color:#404040}.rst-content .align-right{float:right;margin:0px 0px 24px 24px}.rst-content .align-left{float:left;margin:0px 24px 24px 0px}.rst-content .align-center{margin:auto}.rst-content .align-center:not(table){display:block}.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content .toctree-wrapper p.caption .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 .headerlink,.rst-content dl dt .headerlink,.rst-content p.caption .headerlink,.rst-content table>caption .headerlink{visibility:hidden;font-size:14px}.rst-content h1 .headerlink:after,.rst-content h2 .headerlink:after,.rst-content .toctree-wrapper p.caption .headerlink:after,.rst-content h3 .headerlink:after,.rst-content h4 .headerlink:after,.rst-content h5 .headerlink:after,.rst-content h6 .headerlink:after,.rst-content dl dt .headerlink:after,.rst-content p.caption .headerlink:after,.rst-content table>caption .headerlink:after{content:"";font-family:FontAwesome}.rst-content h1:hover .headerlink:after,.rst-content h2:hover .headerlink:after,.rst-content .toctree-wrapper p.caption:hover .headerlink:after,.rst-content h3:hover .headerlink:after,.rst-content h4:hover 
.headerlink:after,.rst-content h5:hover .headerlink:after,.rst-content h6:hover .headerlink:after,.rst-content dl dt:hover .headerlink:after,.rst-content p.caption:hover .headerlink:after,.rst-content table>caption:hover .headerlink:after{visibility:visible}.rst-content table>caption .headerlink:after{font-size:12px}.rst-content .centered{text-align:center}.rst-content .sidebar{float:right;width:40%;display:block;margin:0 0 24px 24px;padding:24px;background:#f3f6f6;border:solid 1px #e1e4e5}.rst-content .sidebar p,.rst-content .sidebar ul,.rst-content .sidebar dl{font-size:90%}.rst-content .sidebar .last{margin-bottom:0}.rst-content .sidebar .sidebar-title{display:block;font-family:"Roboto Slab","ff-tisa-web-pro","Georgia",Arial,sans-serif;font-weight:bold;background:#e1e4e5;padding:6px 12px;margin:-24px;margin-bottom:24px;font-size:100%}.rst-content .highlighted{background:#F1C40F;display:inline-block;font-weight:bold;padding:0 6px}.rst-content .footnote-reference,.rst-content .citation-reference{vertical-align:baseline;position:relative;top:-0.4em;line-height:0;font-size:90%}.rst-content table.docutils.citation,.rst-content table.docutils.footnote{background:none;border:none;color:gray}.rst-content table.docutils.citation td,.rst-content table.docutils.citation tr,.rst-content table.docutils.footnote td,.rst-content table.docutils.footnote tr{border:none;background-color:transparent !important;white-space:normal}.rst-content table.docutils.citation td.label,.rst-content table.docutils.footnote td.label{padding-left:0;padding-right:0;vertical-align:top}.rst-content table.docutils.citation tt,.rst-content table.docutils.citation code,.rst-content table.docutils.footnote tt,.rst-content table.docutils.footnote code{color:#555}.rst-content .wy-table-responsive.citation,.rst-content .wy-table-responsive.footnote{margin-bottom:0}.rst-content .wy-table-responsive.citation+:not(.citation),.rst-content 
.wy-table-responsive.footnote+:not(.footnote){margin-top:24px}.rst-content .wy-table-responsive.citation:last-child,.rst-content .wy-table-responsive.footnote:last-child{margin-bottom:24px}.rst-content table.docutils th{border-color:#e1e4e5}.rst-content table.docutils td .last,.rst-content table.docutils td .last :last-child{margin-bottom:0}.rst-content table.field-list{border:none}.rst-content table.field-list td{border:none}.rst-content table.field-list td>strong{display:inline-block}.rst-content table.field-list .field-name{padding-right:10px;text-align:left;white-space:nowrap}.rst-content table.field-list .field-body{text-align:left}.rst-content tt,.rst-content tt,.rst-content code{color:#000;font-family:SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",Courier,monospace;padding:2px 5px}.rst-content tt big,.rst-content tt em,.rst-content tt big,.rst-content code big,.rst-content tt em,.rst-content code em{font-size:100% !important;line-height:normal}.rst-content tt.literal,.rst-content tt.literal,.rst-content code.literal{color:#E74C3C}.rst-content tt.xref,a .rst-content tt,.rst-content tt.xref,.rst-content code.xref,a .rst-content tt,a .rst-content code{font-weight:bold;color:#404040}.rst-content pre,.rst-content kbd,.rst-content samp{font-family:SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",Courier,monospace}.rst-content a tt,.rst-content a tt,.rst-content a code{color:#2980B9}.rst-content dl{margin-bottom:24px}.rst-content dl dt{font-weight:bold;margin-bottom:12px}.rst-content dl p,.rst-content dl table,.rst-content dl ul,.rst-content dl ol{margin-bottom:12px !important}.rst-content dl dd{margin:0 0 12px 24px;line-height:24px}.rst-content dl:not(.docutils){margin-bottom:24px}.rst-content dl:not(.docutils) dt{display:table;margin:6px 0;font-size:90%;line-height:normal;background:#e7f2fa;color:#2980B9;border-top:solid 3px #6ab0de;padding:6px;position:relative}.rst-content dl:not(.docutils) 
dt:before{color:#6ab0de}.rst-content dl:not(.docutils) dt .headerlink{color:#404040;font-size:100% !important}.rst-content dl:not(.docutils) dl dt{margin-bottom:6px;border:none;border-left:solid 3px #ccc;background:#f0f0f0;color:#555}.rst-content dl:not(.docutils) dl dt .headerlink{color:#404040;font-size:100% !important}.rst-content dl:not(.docutils) dt:first-child{margin-top:0}.rst-content dl:not(.docutils) tt,.rst-content dl:not(.docutils) tt,.rst-content dl:not(.docutils) code{font-weight:bold}.rst-content dl:not(.docutils) tt.descname,.rst-content dl:not(.docutils) tt.descclassname,.rst-content dl:not(.docutils) tt.descname,.rst-content dl:not(.docutils) code.descname,.rst-content dl:not(.docutils) tt.descclassname,.rst-content dl:not(.docutils) code.descclassname{background-color:transparent;border:none;padding:0;font-size:100% !important}.rst-content dl:not(.docutils) tt.descname,.rst-content dl:not(.docutils) tt.descname,.rst-content dl:not(.docutils) code.descname{font-weight:bold}.rst-content dl:not(.docutils) .optional{display:inline-block;padding:0 4px;color:#000;font-weight:bold}.rst-content dl:not(.docutils) .property{display:inline-block;padding-right:8px}.rst-content .viewcode-link,.rst-content .viewcode-back{display:inline-block;color:#27AE60;font-size:80%;padding-left:24px}.rst-content .viewcode-back{display:block;float:right}.rst-content p.rubric{margin-bottom:12px;font-weight:bold}.rst-content tt.download,.rst-content code.download{background:inherit;padding:inherit;font-weight:normal;font-family:inherit;font-size:inherit;color:inherit;border:inherit;white-space:inherit}.rst-content tt.download span:first-child,.rst-content code.download span:first-child{-webkit-font-smoothing:subpixel-antialiased}.rst-content tt.download span:first-child:before,.rst-content code.download span:first-child:before{margin-right:4px}.rst-content .guilabel{border:1px solid #7fbbe3;background:#e7f2fa;font-size:80%;font-weight:700;border-radius:4px;padding:2.4px 
6px;margin:auto 2px}.rst-content .versionmodified{font-style:italic}@media screen and (max-width: 480px){.rst-content .sidebar{width:100%}}span[id*='MathJax-Span']{color:#404040}.math{text-align:center}@font-face{font-family:"Lato";src:url("../fonts/Lato/lato-regular.eot");src:url("../fonts/Lato/lato-regular.eot?#iefix") format("embedded-opentype"),url("../fonts/Lato/lato-regular.woff2") format("woff2"),url("../fonts/Lato/lato-regular.woff") format("woff"),url("../fonts/Lato/lato-regular.ttf") format("truetype");font-weight:400;font-style:normal}@font-face{font-family:"Lato";src:url("../fonts/Lato/lato-bold.eot");src:url("../fonts/Lato/lato-bold.eot?#iefix") format("embedded-opentype"),url("../fonts/Lato/lato-bold.woff2") format("woff2"),url("../fonts/Lato/lato-bold.woff") format("woff"),url("../fonts/Lato/lato-bold.ttf") format("truetype");font-weight:700;font-style:normal}@font-face{font-family:"Lato";src:url("../fonts/Lato/lato-bolditalic.eot");src:url("../fonts/Lato/lato-bolditalic.eot?#iefix") format("embedded-opentype"),url("../fonts/Lato/lato-bolditalic.woff2") format("woff2"),url("../fonts/Lato/lato-bolditalic.woff") format("woff"),url("../fonts/Lato/lato-bolditalic.ttf") format("truetype");font-weight:700;font-style:italic}@font-face{font-family:"Lato";src:url("../fonts/Lato/lato-italic.eot");src:url("../fonts/Lato/lato-italic.eot?#iefix") format("embedded-opentype"),url("../fonts/Lato/lato-italic.woff2") format("woff2"),url("../fonts/Lato/lato-italic.woff") format("woff"),url("../fonts/Lato/lato-italic.ttf") format("truetype");font-weight:400;font-style:italic}@font-face{font-family:"Roboto Slab";font-style:normal;font-weight:400;src:url("../fonts/RobotoSlab/roboto-slab.eot");src:url("../fonts/RobotoSlab/roboto-slab-v7-regular.eot?#iefix") format("embedded-opentype"),url("../fonts/RobotoSlab/roboto-slab-v7-regular.woff2") format("woff2"),url("../fonts/RobotoSlab/roboto-slab-v7-regular.woff") 
format("woff"),url("../fonts/RobotoSlab/roboto-slab-v7-regular.ttf") format("truetype")}@font-face{font-family:"Roboto Slab";font-style:normal;font-weight:700;src:url("../fonts/RobotoSlab/roboto-slab-v7-bold.eot");src:url("../fonts/RobotoSlab/roboto-slab-v7-bold.eot?#iefix") format("embedded-opentype"),url("../fonts/RobotoSlab/roboto-slab-v7-bold.woff2") format("woff2"),url("../fonts/RobotoSlab/roboto-slab-v7-bold.woff") format("woff"),url("../fonts/RobotoSlab/roboto-slab-v7-bold.ttf") format("truetype")}
diff --git a/docs/_build/html/_static/doctools.js b/docs/_build/html/_static/doctools.js
deleted file mode 100644
index 565497723..000000000
--- a/docs/_build/html/_static/doctools.js
+++ /dev/null
@@ -1,287 +0,0 @@
-/*
- * doctools.js
- * ~~~~~~~~~~~
- *
- * Sphinx JavaScript utilities for all documentation.
- *
- * :copyright: Copyright 2007-2017 by the Sphinx team, see AUTHORS.
- * :license: BSD, see LICENSE for details.
- *
- */
-
-/**
- * select a different prefix for underscore
- */
-$u = _.noConflict();
-
-/**
- * make the code below compatible with browsers without
- * an installed firebug like debugger
-if (!window.console || !console.firebug) {
- var names = ["log", "debug", "info", "warn", "error", "assert", "dir",
- "dirxml", "group", "groupEnd", "time", "timeEnd", "count", "trace",
- "profile", "profileEnd"];
- window.console = {};
- for (var i = 0; i < names.length; ++i)
- window.console[names[i]] = function() {};
-}
- */
-
-/**
- * small helper function to urldecode strings
- */
-jQuery.urldecode = function(x) {
- return decodeURIComponent(x).replace(/\+/g, ' ');
-};
-
-/**
- * small helper function to urlencode strings
- */
-jQuery.urlencode = encodeURIComponent;
-
-/**
- * This function returns the parsed url parameters of the
- * current request. Multiple values per key are supported,
- * it will always return arrays of strings for the value parts.
- */
-jQuery.getQueryParameters = function(s) {
- if (typeof s == 'undefined')
- s = document.location.search;
- var parts = s.substr(s.indexOf('?') + 1).split('&');
- var result = {};
- for (var i = 0; i < parts.length; i++) {
- var tmp = parts[i].split('=', 2);
- var key = jQuery.urldecode(tmp[0]);
- var value = jQuery.urldecode(tmp[1]);
- if (key in result)
- result[key].push(value);
- else
- result[key] = [value];
- }
- return result;
-};
-
-/**
- * highlight a given string on a jquery object by wrapping it in
- * span elements with the given class name.
- */
-jQuery.fn.highlightText = function(text, className) {
- function highlight(node) {
- if (node.nodeType == 3) {
- var val = node.nodeValue;
- var pos = val.toLowerCase().indexOf(text);
- if (pos >= 0 && !jQuery(node.parentNode).hasClass(className)) {
- var span = document.createElement("span");
- span.className = className;
- span.appendChild(document.createTextNode(val.substr(pos, text.length)));
- node.parentNode.insertBefore(span, node.parentNode.insertBefore(
- document.createTextNode(val.substr(pos + text.length)),
- node.nextSibling));
- node.nodeValue = val.substr(0, pos);
- }
- }
- else if (!jQuery(node).is("button, select, textarea")) {
- jQuery.each(node.childNodes, function() {
- highlight(this);
- });
- }
- }
- return this.each(function() {
- highlight(this);
- });
-};
-
-/*
- * backward compatibility for jQuery.browser
- * This will be supported until firefox bug is fixed.
- */
-if (!jQuery.browser) {
- jQuery.uaMatch = function(ua) {
- ua = ua.toLowerCase();
-
- var match = /(chrome)[ \/]([\w.]+)/.exec(ua) ||
- /(webkit)[ \/]([\w.]+)/.exec(ua) ||
- /(opera)(?:.*version|)[ \/]([\w.]+)/.exec(ua) ||
- /(msie) ([\w.]+)/.exec(ua) ||
- ua.indexOf("compatible") < 0 && /(mozilla)(?:.*? rv:([\w.]+)|)/.exec(ua) ||
- [];
-
- return {
- browser: match[ 1 ] || "",
- version: match[ 2 ] || "0"
- };
- };
- jQuery.browser = {};
- jQuery.browser[jQuery.uaMatch(navigator.userAgent).browser] = true;
-}
-
-/**
- * Small JavaScript module for the documentation.
- */
-var Documentation = {
-
- init : function() {
- this.fixFirefoxAnchorBug();
- this.highlightSearchWords();
- this.initIndexTable();
-
- },
-
- /**
- * i18n support
- */
- TRANSLATIONS : {},
- PLURAL_EXPR : function(n) { return n == 1 ? 0 : 1; },
- LOCALE : 'unknown',
-
- // gettext and ngettext don't access this so that the functions
- // can safely bound to a different name (_ = Documentation.gettext)
- gettext : function(string) {
- var translated = Documentation.TRANSLATIONS[string];
- if (typeof translated == 'undefined')
- return string;
- return (typeof translated == 'string') ? translated : translated[0];
- },
-
- ngettext : function(singular, plural, n) {
- var translated = Documentation.TRANSLATIONS[singular];
- if (typeof translated == 'undefined')
- return (n == 1) ? singular : plural;
- return translated[Documentation.PLURALEXPR(n)];
- },
-
- addTranslations : function(catalog) {
- for (var key in catalog.messages)
- this.TRANSLATIONS[key] = catalog.messages[key];
- this.PLURAL_EXPR = new Function('n', 'return +(' + catalog.plural_expr + ')');
- this.LOCALE = catalog.locale;
- },
-
- /**
- * add context elements like header anchor links
- */
- addContextElements : function() {
- $('div[id] > :header:first').each(function() {
- $('\u00B6').
- attr('href', '#' + this.id).
- attr('title', _('Permalink to this headline')).
- appendTo(this);
- });
- $('dt[id]').each(function() {
- $('\u00B6').
- attr('href', '#' + this.id).
- attr('title', _('Permalink to this definition')).
- appendTo(this);
- });
- },
-
- /**
- * workaround a firefox stupidity
- * see: https://bugzilla.mozilla.org/show_bug.cgi?id=645075
- */
- fixFirefoxAnchorBug : function() {
- if (document.location.hash)
- window.setTimeout(function() {
- document.location.href += '';
- }, 10);
- },
-
- /**
- * highlight the search words provided in the url in the text
- */
- highlightSearchWords : function() {
- var params = $.getQueryParameters();
- var terms = (params.highlight) ? params.highlight[0].split(/\s+/) : [];
- if (terms.length) {
- var body = $('div.body');
- if (!body.length) {
- body = $('body');
- }
- window.setTimeout(function() {
- $.each(terms, function() {
- body.highlightText(this.toLowerCase(), 'highlighted');
- });
- }, 10);
- $('