diff --git a/docs/_build/doctrees/algorithms/vpg.doctree b/docs/_build/doctrees/algorithms/vpg.doctree index 473f8b342..6fbf066b7 100644 Binary files a/docs/_build/doctrees/algorithms/vpg.doctree and b/docs/_build/doctrees/algorithms/vpg.doctree differ diff --git a/docs/_build/doctrees/environment.pickle b/docs/_build/doctrees/environment.pickle index 36974c30e..021f8250b 100644 Binary files a/docs/_build/doctrees/environment.pickle and b/docs/_build/doctrees/environment.pickle differ diff --git a/docs/_build/doctrees/spinningup/keypapers.doctree b/docs/_build/doctrees/spinningup/keypapers.doctree index 92656eb40..3730e4cf9 100644 Binary files a/docs/_build/doctrees/spinningup/keypapers.doctree and b/docs/_build/doctrees/spinningup/keypapers.doctree differ diff --git a/docs/_build/doctrees/spinningup/rl_intro.doctree b/docs/_build/doctrees/spinningup/rl_intro.doctree index c25de19db..b7a7d0f44 100644 Binary files a/docs/_build/doctrees/spinningup/rl_intro.doctree and b/docs/_build/doctrees/spinningup/rl_intro.doctree differ diff --git a/docs/_build/doctrees/user/algorithms.doctree b/docs/_build/doctrees/user/algorithms.doctree index 6da233a8b..095bea7d9 100644 Binary files a/docs/_build/doctrees/user/algorithms.doctree and b/docs/_build/doctrees/user/algorithms.doctree differ diff --git a/docs/_build/doctrees/user/introduction.doctree b/docs/_build/doctrees/user/introduction.doctree index 85f72dd01..92c453c21 100644 Binary files a/docs/_build/doctrees/user/introduction.doctree and b/docs/_build/doctrees/user/introduction.doctree differ diff --git a/docs/_build/html/_modules/spinup/algos/ddpg/ddpg.html b/docs/_build/html/_modules/spinup/algos/ddpg/ddpg.html index 796b3a8d0..a5434d7f6 100644 --- a/docs/_build/html/_modules/spinup/algos/ddpg/ddpg.html +++ b/docs/_build/html/_modules/spinup/algos/ddpg/ddpg.html @@ -368,7 +368,7 @@
logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'pi': pi, 'q': q})
def get_action(o, noise_scale):
- a = sess.run(pi, feed_dict={x_ph: o.reshape(1,-1)})
+ a = sess.run(pi, feed_dict={x_ph: o.reshape(1,-1)})[0]
a += noise_scale * np.random.randn(act_dim)
return np.clip(a, -act_limit, act_limit)
diff --git a/docs/_build/html/_modules/spinup/algos/sac/sac.html b/docs/_build/html/_modules/spinup/algos/sac/sac.html
index 50bc4e73f..2c8bf030f 100644
--- a/docs/_build/html/_modules/spinup/algos/sac/sac.html
+++ b/docs/_build/html/_modules/spinup/algos/sac/sac.html
@@ -404,7 +404,7 @@ Source code for spinup.algos.sac.sac
def get_action(o, deterministic=False):
act_op = mu if deterministic else pi
- return sess.run(act_op, feed_dict={x_ph: o.reshape(1,-1)})
+ return sess.run(act_op, feed_dict={x_ph: o.reshape(1,-1)})[0]
def test_agent(n=10):
global sess, mu, pi, q1, q2, q1_pi, q2_pi
diff --git a/docs/_build/html/_modules/spinup/algos/td3/td3.html b/docs/_build/html/_modules/spinup/algos/td3/td3.html
index 5ab542057..b9a11a2c3 100644
--- a/docs/_build/html/_modules/spinup/algos/td3/td3.html
+++ b/docs/_build/html/_modules/spinup/algos/td3/td3.html
@@ -394,7 +394,7 @@ Source code for spinup.algos.td3.td3
logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'pi': pi, 'q1': q1, 'q2': q2})
def get_action(o, noise_scale):
- a = sess.run(pi, feed_dict={x_ph: o.reshape(1,-1)})
+ a = sess.run(pi, feed_dict={x_ph: o.reshape(1,-1)})[0]
a += noise_scale * np.random.randn(act_dim)
return np.clip(a, -act_limit, act_limit)
diff --git a/docs/_build/html/_modules/spinup/utils/run_utils.html b/docs/_build/html/_modules/spinup/utils/run_utils.html
index 89ac25c95..156327998 100644
--- a/docs/_build/html/_modules/spinup/utils/run_utils.html
+++ b/docs/_build/html/_modules/spinup/utils/run_utils.html
@@ -355,7 +355,7 @@ Source code for spinup.utils.run_utils
encoded_thunk = base64.b64encode(zlib.compress(pickled_thunk)).decode('utf-8')
entrypoint = osp.join(osp.abspath(osp.dirname(__file__)),'run_entrypoint.py')
- cmd = ['python', entrypoint, encoded_thunk]
+ cmd = [sys.executable if sys.executable else 'python', entrypoint, encoded_thunk]
try:
subprocess.check_call(cmd, env=os.environ)
except CalledProcessError:
diff --git a/docs/_build/html/_sources/algorithms/vpg.rst.txt b/docs/_build/html/_sources/algorithms/vpg.rst.txt
index 1b612bb9c..bc4e06a2e 100644
--- a/docs/_build/html/_sources/algorithms/vpg.rst.txt
+++ b/docs/_build/html/_sources/algorithms/vpg.rst.txt
@@ -40,7 +40,7 @@ The policy gradient algorithm works by updating policy parameters via stochastic
\theta_{k+1} = \theta_k + \alpha \nabla_{\theta} J(\pi_{\theta_k})
-Policy gradient implementations typically compute advantage function estimates based on the infinite-horizon discounted return, despite otherwise use the finite-horizon undiscounted policy gradient formula.
+Policy gradient implementations typically compute advantage function estimates based on the infinite-horizon discounted return, despite otherwise using the finite-horizon undiscounted policy gradient formula.
Exploration vs. Exploitation
----------------------------
diff --git a/docs/_build/html/_sources/spinningup/keypapers.rst.txt b/docs/_build/html/_sources/spinningup/keypapers.rst.txt
index 1961fe4de..d3dcc8fcf 100644
--- a/docs/_build/html/_sources/spinningup/keypapers.rst.txt
+++ b/docs/_build/html/_sources/spinningup/keypapers.rst.txt
@@ -191,7 +191,7 @@ a. Model is Learned
.. [#] `Neural Network Dynamics for Model-Based Deep Reinforcement Learning with Model-Free Fine-Tuning `_, Nagabandi et al, 2017. **Algorithm: MBMF.**
-.. [#] `Model-Based Value Estimation for Efficient Model-Free Reinforcement Learning `_, Feinberg et al, 2018. **Algorithm: MBVE.**
+.. [#] `Model-Based Value Expansion for Efficient Model-Free Reinforcement Learning `_, Feinberg et al, 2018. **Algorithm: MVE.**
.. [#] `Sample-Efficient Reinforcement Learning with Stochastic Ensemble Value Expansion `_, Buckman et al, 2018. **Algorithm: STEVE.**
diff --git a/docs/_build/html/_sources/spinningup/rl_intro.rst.txt b/docs/_build/html/_sources/spinningup/rl_intro.rst.txt
index 2a498a09f..2c159ca09 100644
--- a/docs/_build/html/_sources/spinningup/rl_intro.rst.txt
+++ b/docs/_build/html/_sources/spinningup/rl_intro.rst.txt
@@ -22,7 +22,7 @@ RL methods have recently enjoyed a wide variety of successes. For example, it's
.. raw:: html
-
Policy gradient implementations typically compute advantage function estimates based on the infinite-horizon discounted return, despite otherwise using the finite-horizon undiscounted policy gradient formula.
Exploration vs. Exploitation¶
diff --git a/docs/_build/html/searchindex.js b/docs/_build/html/searchindex.js
index d42eb92db..d9066f730 100644
--- a/docs/_build/html/searchindex.js
+++ b/docs/_build/html/searchindex.js
@@ -1 +1 @@
-Search.setIndex({docnames:["algorithms/ddpg","algorithms/ppo","algorithms/sac","algorithms/td3","algorithms/trpo","algorithms/vpg","etc/acknowledgements","etc/author","index","spinningup/bench","spinningup/exercise2_1_soln","spinningup/exercise2_2_soln","spinningup/exercises","spinningup/extra_pg_proof1","spinningup/extra_pg_proof2","spinningup/keypapers","spinningup/rl_intro","spinningup/rl_intro2","spinningup/rl_intro3","spinningup/rl_intro4","spinningup/spinningup","user/algorithms","user/installation","user/introduction","user/plotting","user/running","user/saving_and_loading","utils/logger","utils/mpi","utils/plotter","utils/run_utils"],envversion:51,filenames:["algorithms/ddpg.rst","algorithms/ppo.rst","algorithms/sac.rst","algorithms/td3.rst","algorithms/trpo.rst","algorithms/vpg.rst","etc/acknowledgements.rst","etc/author.rst","index.rst","spinningup/bench.rst","spinningup/exercise2_1_soln.rst","spinningup/exercise2_2_soln.rst","spinningup/exercises.rst","spinningup/extra_pg_proof1.rst","spinningup/extra_pg_proof2.rst","spinningup/keypapers.rst","spinningup/rl_intro.rst","spinningup/rl_intro2.rst","spinningup/rl_intro3.rst","spinningup/rl_intro4.rst","spinningup/spinningup.rst","user/algorithms.rst","user/installation.rst","user/introduction.rst","user/plotting.rst","user/running.rst","user/saving_and_loading.rst","utils/logger.rst","utils/mpi.rst","utils/plotter.rst","utils/run_utils.rst"],objects:{"":{"--ac_kwargs":[25,4,1,"cmdoption-act"],"--act":[25,4,1,"cmdoption-act"],"--count":[24,4,1,"cmdoption-count"],"--cpu":[25,4,1,"cmdoption-cpu"],"--data_dir":[25,4,1,"cmdoption-data-dir"],"--datestamp":[25,4,1,"cmdoption-datestamp"],"--deterministic":[26,4,1,"cmdoption-d"],"--env":[25,4,1,"cmdoption-env"],"--env_name":[25,4,1,"cmdoption-env"],"--episodes":[26,4,1,"cmdoption-n"],"--exclude":[24,4,1,"cmdoption-exclude"],"--exp_name":[25,4,1,"cmdoption-exp-name"],"--hid":[25,4,1,"cmdoption-hid"],"--itr":[26,4,1,"cmdoption-i"],"--legend":[24,4,1,"cmdoption-l"],"--len":[26,4,1,"cmdoption-l"],"--norender":[26,4,1,"cmdoption-nr"],"--num_cpu":[25,4,1,"cmdoption-cpu"],"--select":[24,4,1,"cmdoption-select"],"--smooth":[24,4,1,"cmdoption-s"],"--value":[24,4,1,"cmdoption-y"],"--xaxis":[24,4,1,"cmdoption-x"],"-d":[26,4,1,"cmdoption-d"],"-i":[26,4,1,"cmdoption-i"],"-l":[26,4,1,"cmdoption-l"],"-n":[26,4,1,"cmdoption-n"],"-nr":[26,4,1,"cmdoption-nr"],"-s":[24,4,1,"cmdoption-s"],"-x":[24,4,1,"cmdoption-x"],"-y":[24,4,1,"cmdoption-y"],"default":[26,4,1,"cmdoption-i"],logdir:[24,4,1,"cmdoption-arg-logdir"]},"spinup.utils":{mpi_tf:[28,3,0,"-"],mpi_tools:[28,3,0,"-"]},"spinup.utils.logx":{EpochLogger:[27,1,1,""],Logger:[27,1,1,""],restore_tf_graph:[27,0,1,""]},"spinup.utils.logx.EpochLogger":{get_stats:[27,2,1,""],log_tabular:[27,2,1,""],store:[27,2,1,""]},"spinup.utils.logx.Logger":{__init__:[27,2,1,""],dump_tabular:[27,2,1,""],log:[27,2,1,""],log_tabular:[27,2,1,""],save_config:[27,2,1,""],save_state:[27,2,1,""],setup_tf_saver:[27,2,1,""]},"spinup.utils.mpi_tf":{MpiAdamOptimizer:[28,1,1,""],sync_all_params:[28,0,1,""]},"spinup.utils.mpi_tf.MpiAdamOptimizer":{apply_gradients:[28,2,1,""],compute_gradients:[28,2,1,""]},"spinup.utils.mpi_tools":{mpi_avg:[28,0,1,""],mpi_fork:[28,0,1,""],mpi_statistics_scalar:[28,0,1,""],num_procs:[28,0,1,""],proc_id:[28,0,1,""]},"spinup.utils.run_utils":{ExperimentGrid:[30,1,1,""],call_experiment:[30,0,1,""],setup_logger_kwargs:[30,0,1,""]},"spinup.utils.run_utils.ExperimentGrid":{add:[30,2,1,""],print:[30,2,1,""],run:[30,2,1,""],variant_name:[30,2,1,""],variants:[30,2,1,""]},spinup:{ddpg:[0,0,1,""],ppo:[1,0,1,""],sac:[2,0,1,""],td3:[3,0,1,""],trpo:[4,0,1,""],vpg:[5,0,1,""]}},objnames:{"0":["py","function","Python function"],"1":["py","class","Python class"],"2":["py","method","Python method"],"3":["py","module","Python module"],"4":["std","cmdoption","program option"]},objtypes:{"0":"py:function","1":"py:class","2":"py:method","3":"py:module","4":"std:cmdoption"},terms:{"11th":[],"128_ac":25,"13th":20,"1_h32":25,"1_simple_pg":18,"2_rtg_pg":18,"32_seed10":25,"80s":21,"90s":21,"abstract":[15,20],"break":[0,3,12,13,14,18,20,23],"case":[1,9,12,13,14,16,17,18,20,22,25,26,27],"class":[8,15,21,28,30],"default":[9,12,22,24,25,26,27,30],"final":[2,4,16,20,21,22,27],"float":[0,1,2,3,4,5],"function":[0,1,2,3,4,5,9,10,11,13,15,17,18,20,21,25,27,28,30],"import":[11,15,16,17,18,20,22,23,25,26,27],"int":[0,1,2,3,4,5,18,24,25,26,27,28,30],"long":[1,11,18,20,21,22,23,25,26],"new":[1,4,17,18,20,22,23],"public":23,"return":[0,1,2,3,4,5,9,11,12,14,17,18,20,26,27,28,30],"rockt\u00e4schel":20,"short":[18,20,23,25],"super":[15,17,20],"throw":17,"true":[0,11,18,25,27,28,30],"try":[0,3,4,12,16,20,22,25,26],"var":[26,27],"while":[1,2,3,4,16,17,18,20,22,23,27],AIs:16,AWS:20,Adding:11,And:[0,18,20,22,23],Ape:15,Are:[12,15],But:[0,1,2,4,11,13,16,17,18,20,21,24,26,27],CTS:15,Doing:8,For:[0,2,3,16,17,18,20,23,25,26,27,28,30],Its:16,Not:[12,18],One:[0,3,13,15,16,17,18,23],RHS:2,That:[2,16,17,18,20,23],The:[1,2,3,4,5,8,9,10,12,13,14,15,17,18,23,24,25,26,27,28,30],Then:[0,13,18,20,22,27],There:[0,1,12,16,17,18,20,22,23,25,26,30],These:[8,12,16,20,25],Use:[12,17,18,20,26,27],Used:30,Uses:30,Using:[0,1,3,4,5,8,13,15],Will:27,With:[0,2,10,15,22,27],__init__:27,__main__:[25,27],__name__:[25,27],_h128:25,_h300_ac:25,a2c:[17,18,20],a3c:[15,17,20],a_ph:[0,1,2,3,4,5],a_t:18,abbeel:[6,7,15],abil:13,abl:[0,10,16,17,18,21],ablat:[20,30],about:[6,8,12,13,16,17,18,20,21,22,23,26,27,30],abov:4,absenc:[16,21,25],absent:2,absolut:[3,16],ac_kwarg:[0,1,2,3,4,5,25],academ:20,acc:27,acceler:[2,15,20],accept:[16,25,30],access:[0,1,2,3,4,5,16,17,20],accident:1,accord:[0,1,2,3,4,5,16,17,18,21],account:25,accru:18,accur:[4,17],accuraci:27,acer:15,achiam:[7,15,20],achiev:[3,16,20,23],acknowledg:8,acktr:[15,20],acquaint:16,across:[2,4,9,15,24,25,27,28],act:[2,14,16,17,18,25,26],act_dim:[0,1,2,3,4,5,11,16],act_limit:11,act_nois:[0,3],act_ph:18,action:[0,1,2,3,4,5,9,13,14,17,18,20,21,26,27],action_mask:18,action_spac:11,activ:[11,16,22,25,27,28],actor:[3,8,11,12,15,17,20,21,23,25],actor_crit:[0,1,2,3,4,5,21],actrelu:25,acttanh:25,actual:[2,4,11,16,18,20,24,30],adam:[5,20,28],adamoptim:[27,28],adapt:[0,15],add:[0,3,4,11,18,25,26,27,30],add_argu:25,added:[0,3,11,25,30],adding:3,addit:[0,16,28],addition:23,address:[3,15,23],adher:21,adjust:[1,4,16],advanc:[17,20],advantag:[1,4,5,12,15,17,18],advent:21,adversari:15,advic:20,advis:[7,18],after:[0,1,2,3,13,14,16,17,18,20,21,22,23,25,26,27,28],afterward:17,again:[1,13,18,26],against:[12,20],agent:[0,1,2,3,4,5,9,10,12,15,16,17,18,20,21,22,23,26,27],agi:23,agnost:15,ahead:17,aim:[16,18,20,24],alex:[6,20],algo1:24,algo2:24,algo:[4,12,25,26],algo_nam:25,algorithm:[0,1,2,3,4,5,8,9,16,18,20,22,23,24,25,27],align:[15,20],all:[0,1,2,3,9,12,15,16,17,18,20,21,23,24,25,26,27,28,30],allow:[0,1,2,4,13,16,17,18,22,25,30],almost:[2,3,4,11,16,17,18,20,23,28],alon:17,along:[3,12,20,22],alongsid:26,alpha:2,alphazero:[15,17],alreadi:[1,4,5,20,23],also:[0,1,2,4,6,11,16,17,18,20,21,22,23,25,26,27,28,30],altern:[2,15,16,20],although:[1,16,18,21,25],altogeth:21,alwai:[0,1,2,3,4,5,16,17,18,20,21],amanda:6,ambigu:23,amen:25,amend:30,amodei:15,among:21,amount:[1,4,5,20],anaconda:22,analag:14,analys:20,analysi:[1,8,20],analyt:[4,18],andrej:20,andrychowicz:[15,17],anecdot:12,angl:[16,18,20],ani:[0,1,2,3,4,5,12,16,17,18,20,21,22,23,24,25,26,27],announc:23,anonym:15,anoth:[2,3,11,16,17,20,22,26],answer:[4,23],ant:25,anthoni:15,anticip:23,anymor:1,anyon:22,anyth:[12,16,26,27],anywher:[18,27],api:[0,1,2,3,4,5],appeal:[16,18],appear:[12,20,22,25],append:18,appendix:2,appli:[13,15,16],applic:12,apply_gradi:28,approach:[0,2,15,17,18,20,23],approch:20,appropri:[0,1,2,3,4,5,16],approxim:[0,1,2,3,4,5,15,17,18,20,21,23],april:23,apt:22,arang:11,arbitrari:[2,15,16,27],architectur:[9,15,20],area:21,aren:[17,20,23],arg:[0,1,2,3,4,5,24,25,26,27,30],argmax:27,argpars:25,argument:[0,2,3,16,21,22,24,25,27,28,30],argumentpars:25,aris:[3,23],around:[4,22],arrai:[11,18,28],arriv:5,art:20,articl:[18,20],artifact:4,as_list:11,ascent:[0,5,17,18],ask:[23,27],askel:6,aspect:20,aspir:20,assertionerror:26,assum:[0,2,16,18,20,30],assumpt:11,asynchron:[15,17,20],atari:[15,16,20],atom:17,attent:[15,20],attribut:20,augment:[15,17],author:[0,2,8,18],auto:[20,25,30],autocomplet:24,automat:[1,12,24,25,30],autonom:15,auxiliari:15,avail:[12,16,17,20,22,25,30],averag:[0,2,3,4,9,10,11,12,16,18,24,27,28],average_onli:27,averageepret:[24,26],averagetest:27,averagetestepret:24,avert:3,avoid:[4,16,17,20,30],awai:[1,20],awar:[18,20,23],axi:[11,18,24,27],back:[4,13,21],background:[8,12,15,17,18,22,23],backtrack:4,backtrack_coeff:4,backtrack_it:4,backup:[2,11,12,16],bad:[2,4,11,16,17,20],bahdanau:20,ballpark:[12,20],bandwidth:23,bar:12,barri:[],barrier:23,barto:17,base:[0,2,3,5,8,18,20,21,23,25,27,28,30],baselin:[0,1,3,4,8,20,23,28],basi:[18,20],basic:[1,8,14,16,17,20,21,23,25,27],batch:[0,1,2,3,4,5,9,11,12,18,20],batch_act:18,batch_len:18,batch_loss:18,batch_ob:18,batch_ret:18,batch_siz:[0,2,3,18],batch_weight:18,bay:20,bear:18,beat:20,becaus:[0,1,3,4,5,11,12,13,14,16,17,18,20,21,22,23,25,26,30],becom:[1,4,5,11,17,20,23,25],beefi:20,been:[0,2,6,16,17,20,22,23,25,27],befor:[0,2,3,13,15,18,20,25,27],begin:[0,2,3,12,18,23],behav:[17,27],behavior:[0,1,3,11,15,16,20,25,30],behaviour:[1,15],behind:[0,16],being:[0,13,16,17,18,20,23,25],believ:23,bellemar:[15,17],bellman:[0,2,3,11,12,17,21],belong:18,below:[17,22,24,26],ben:6,bench:25,bench_algo1:24,bench_algo2:24,bench_algo:24,bench_ppo_cartpol:25,benchmark:[5,8,15,17,20,22,23],benefit:[1,2,20,21,23],berkelei:[6,7,20],best:[20,23,25],beta:6,better:[0,2,3,16,17,18,20],between:[0,1,2,3,4,5,11,12,13,15,16,17,20,23,24,30],bewar:20,beyond:[1,17,20,23],bia:17,bias:16,big:20,biggest:[1,17,20],bind:28,bind_to_cor:28,bit:[1,2,16,17,20,28],block:18,blog:[20,23],blundel:15,blur:16,bolt:20,bonu:[2,8,12],bonus:2,bool:[25,27,28,30],bootcamp:20,bootstrap:17,both:[1,2,3,4,5,9,16,18,20,21,23,24,27],bottleneck:15,bound:2,boundari:20,box2d:[22,26],boyd:4,bracket:25,brain:[16,20],branch:17,breakthrough:[16,20],brew:22,bridg:[2,15],brittl:[3,20,21],broad:[20,23],broadcast:11,broken:[13,20],brought:15,brundag:6,buckman:15,buffer:[0,2,3,21,26],bug:[20,22,23],bugged_mlp_actor_crit:11,build:[11,16,18,20,21,23,25],built:[8,16,25],bunch:1,bundl:20,burda:15,c51:[15,17],cabi:15,calcul:[0,3,4,13,18,20,21,27],calculu:[18,20],call:[0,8,16,17,18,20,22,25,26,27,28],call_experi:[25,30],callabl:[25,30],can:[0,1,2,3,4,5,8,11,12,13,14,17,18,20,21,22,23,24,25,26,27,30],cancel:11,candid:[17,24],cannot:18,care:[0,2,18,20,21,23,26],carefulli:17,carlo:17,cartpol:[20,25],cash:16,cast:27,castelao:6,categor:[17,18],catherin:20,caus:[1,4,5,17],causal:15,caution:1,ceil:1,center:[6,20],central:[2,16,23],certain:20,cg_iter:4,chain:[13,18,20,23],challeng:[0,8,17,20,23],chanc:[2,13,20],chang:[1,2,3,16,17,18,20,23,25,26,30],channel:15,chapter:[4,5,17,18],charact:[15,16,25],characterist:20,chart:20,check:[1,8,11,12,20,25,26],chess:15,choic:[0,2,12,13,16,17,18,20,23,25,27],choos:[12,16,17,20,30],chose:21,chri:20,christi:6,christiano:15,chung:20,cinch:26,circumst:[17,20],citat:[17,20],claim:[13,14,15,18,20],clark:6,classic:[5,8,17,20,22],classifi:16,clavera:15,clean:[22,27],clear:[5,16,20,23,27],clearli:[13,23],cli:25,click:25,clip:[1,2,3,15,20],clip_ratio:[1,25],clock:12,clone:22,close:[0,1,2,3,4,5,8,21,23,30],closer:23,cloud:20,cluster:17,cmd:25,code:[0,1,4,6,8,15,16,18,20,25],coeffici:[1,2,4],cohort:23,coin:2,coincid:4,colin:6,collaps:[1,4],collect:[0,9,17,18,20],colon:30,color:27,column:[20,24],com:22,combin:[18,23,25],come:[0,2,11,13,16,18,20,22,23,26],comfort:[1,4,12,20],command:[4,8,12,21,22],commit:23,common:[3,16,18,20,23,25],commun:[22,23],compar:[0,5,12,20,24],comparison:[20,23],compat:[6,11,21,23],competit:15,complet:[11,13,16,17,20,21,22,23,25,26],complex:[1,2,20],complic:30,composit:[0,2,3],comprehens:15,compu:[],comput:[0,1,2,3,4,5,11,13,16,17,18,20,21,23,27],compute_gradi:28,concat:11,concept:[8,17,20],conceptu:[18,20],conclud:13,conclus:20,concret:15,concurr:[0,2,3,17,18,20,21],conda:22,condit:[0,1,2,3,4,5,13,14,16,18,27],confer:20,confid:20,config:[24,26,27,30],configur:[25,26,27,30],confound:20,confus:[16,22],conjug:4,connect:[0,2,4,16,17,18,21,23,25],consequ:[16,18,23],conserv:17,consid:[11,12,13,17,18,20,25,26,28],consider:1,consist:[16,17,20,23],constant:[0,11,13,14],constrain:[1,15],constraint:[1,4,16],construct:[18,25,30],contain:[0,2,4,5,11,15,20,21,23,24,26,27,28,30],contest:20,context:[16,17,20],continu:[0,1,2,3,4,5,15,16,18,20,22],contrast:17,contribut:[1,6,15,20,23],control:[0,1,2,4,5,15,16,17,20,22,25],conv:20,conveni:[16,25,30],convent:[0,16,23],converg:[2,15,16],convert:[16,25],convex:4,convolut:[16,20],copi:[0,1,2,3,4,5,18,25,26,27],core:[8,12,16,18,20],correct:[2,4,11,12,20,24,25,26],correctli:[12,18,20,22],correl:0,correspond:[0,1,2,16,17,22],cost:[4,15],could:[2,12,16,18,20],count:[15,20,24,28],counter:15,coupl:[17,20],cours:[0,1,2,3,4,5,9,20],covari:[16,20],cover:[17,18,26],cpo:15,cpu:[20,25,30],crash:26,crater:18,creat:[0,1,2,3,4,5,12,16,17,18,20,22],credit:20,crisp:15,criteria:12,critic:[1,3,4,5,8,11,12,15,16,17,20,21,23,25],critiqu:8,crucial:16,cs231n:20,csaba:20,culmin:21,cumul:[16,20],curat:23,curios:15,current:[0,1,2,3,4,5,16,17,18,20,22,23,25,26,27,28],curriculum:[20,23],curv:[10,11,20,24],custom:25,d_kl:4,d_ph:[11,12],dabnei:[15,17],dai:[13,20,22],dalal:15,damp:[3,4],damping_coeff:4,danger:4,daniel:6,data:[0,1,3,4,15,17,18,20,21,22,24,25,26,27],data_dir:[25,30],dataset:[18,27],date:[25,30],datestamp:[25,30],dd_:25,dd_exp_nam:30,dd_hh:[25,30],dd_ppo_ant_cli0:25,ddpg:[2,8,9,11,15,17,20,21,23,25],deadli:17,deal:[0,4,12,13,16],debug:20,decai:20,decid:[16,17,22,30],decis:[13,14,16,20,23],decompos:13,decreas:1,deep:[2,5,7,12,16,17,18,21,22,23,25],deepli:4,deepmim:15,def:[11,18,27],default_data_dir:[25,26,30],default_shorthand:30,deficit:21,defin:[2,14,16,17,18,21,27],definit:[2,14,16,20,21,22],degrad:[12,20],delai:[0,8,17,21],delta:4,dennison:6,denot:[0,4,5,11,16],dens:[16,27],densiti:[2,15],deped:15,depend:[0,1,2,4,5,6,11,12,14,16,18,20,21,23],depth:18,deriv:[1,8,13],descend:21,descent:[0,1,4,5,9,15,18,20,23],describ:[0,1,4,16,17,18,22,25,27],descript:[0,1,2,3,4,5,15,16,17,18,22,26],deserv:20,design:[3,8,17,20,21,27],desir:[13,17,18,20,27],despit:[5,18,20],detail:[0,1,2,8,11,18,20,23,26,27],deter:20,determin:23,determinist:[2,3,8,9,17,21,26],dev:[2,9,22],develop:[3,7,8,17,23],deviat:[2,16,20,27],devin:6,dexter:15,diagnos:20,diagnost:[21,27],diagon:12,diayn:15,dict:[0,1,2,3,4,5,25,26,27,30],dictionari:27,did:[2,15,22],didn:13,differ:[1,2,3,4,10,11,12,15,16,18,20,23,24,25,26],differenti:[0,2],dig:[16,23,26],digest:[17,23],dimens:[3,11],dimension:[1,4,5,15,16],direct:2,directli:[0,4,16,17,20,21,25,30],directori:[8,24,25,27,30],disabl:30,discard:17,disclaim:17,discount:[0,1,2,3,4,5,14,16,18],discourag:20,discov:20,discoveri:15,discret:[0,1,2,4,5,16],discrimin:15,discuss:[16,18,23,25],disguis:20,dishearten:20,distanc:4,distil:[15,17],distinct:[13,16],distract:8,distribut:[0,2,3,4,12,13,16,17,18,20,23,26],dive:[4,20],diverg:[1,4],divers:15,doc:[8,23,25,26,28],docstr:25,document:[8,15,16,22,23,25,26,27,28,29],doe:[0,1,2,3,4,13,16,18,20,26],doesn:[0,1,4,11,13,18,20,26],doing:[1,4,20,21],domain:20,don:[4,8,11,12,16,17,20,21,22,23,26],done:[2,11,18,20,23],donoghu:15,dopamin:15,dota:16,doubl:[2,3,15,26],down:[0,4,5,13,18,26],download:22,downsid:17,dpg:[0,15],dqn:[0,15,17,20],dramat:[1,3],drastic:12,draw:17,drawn:2,dream:17,drive:20,driven:15,drop:[13,18],dropout:20,dry:22,dtype:[16,18],dualiti:4,duan:[5,15,20],due:[2,4],duel:15,dump_tabular:27,dure:[4,17,18],dylan:6,dynam:15,each:[0,1,2,3,4,5,8,11,12,13,16,17,18,20,21,23,24,25,26,27,28,30],earli:[1,10,13,15,18,20,21],earlier:[0,5,15],eas:18,easi:[17,20,23,25,26,27,28],easier:[1,16,17,20,23,30],easiest:4,easili:[24,25,26,27],educ:23,effect:[12,17,18,20],effici:[0,4,15,17,20,21],effort:[12,17,20],eglp:[13,18],either:[0,1,2,3,4,5,16,17,20,23,24,25,30],element:[20,21,27],elementwis:16,els:[11,18,20,27],elu:25,emb:17,embed:[15,17,27],embrac:20,emerg:[1,15],emphasi:20,empir:[1,15,18,20,21],emploi:0,empow:23,empti:[18,26],enabl:25,encod:[18,20],encompass:17,encourag:[1,4,5,23],end:[1,18,20,23,26,27],endless:23,endow:17,enforc:[16,20],engag:20,engin:[20,22,23],enjoi:[16,18,20],enough:[0,3,11,18,20,21,22,23,25],ensembl:15,ensur:[1,2,23,25],enter:20,entir:20,entireti:[12,14,18],entri:[4,11,16,21,23],entropi:[15,17,20],env:[12,18,22,25,26],env_fn:[0,1,2,3,4,5,25,30],env_nam:[25,30],environ:[0,1,2,3,4,5,8,11,12,15,16,17,18,20,21,22,27,30],ep_len:18,ep_ret:18,ep_rew:18,episod:[0,1,2,3,4,5,12,15,16,18,20,23,26],eplen:26,epoch:[0,1,2,3,4,5,9,12,18,25,26,27],epoch_logg:27,epochlogg:[0,1,2,3,4,5,27],epret:26,epsilon:20,equal:[4,14,18,20,27],equat:[17,18,20,21],equival:[1,2,4,5,15,17,18,25],ermon:15,error:[0,2,3,4,11,15,16,18,23,25],escap:25,especi:[4,17,20,22],espeholt:15,essai:[12,17,23],essenti:[3,16,25,27],establish:[0,15,20],estim:[0,1,2,3,4,5,12,15,16,17,18,20],etc:[8,17,26],eval:25,evalu:[0,2,4,12,18,20,26],even:[0,4,17,18,20,25],evenli:2,ever:[0,16,18,26],everi:[0,2,3,9,13,16,20,23,25],everyon:23,everyth:[0,14,20,22,26,27],everywher:11,evolut:15,evolv:20,ex2:[12,15],exact:23,exactli:[3,4,18,20],examin:18,exampl:[11,13,15,16,17,18,20,22,23,25,30],exc:24,except:[2,12,20,22,23,25,28],excerpt:[11,12],excit:[20,23],exclud:24,exclus:24,execut:[17,22,25,30],exemplar:15,exercis:[4,8,23],exercise1_1:12,exercise1_1_soln:12,exercise1_2:12,exercise1_2_soln:12,exercise1_3:12,exercise2_2:12,exhaust:[0,6,17],exist:[17,20],exit:[15,17],exp_nam:[12,22,24,25,27,30],exp_name_:30,expand:[2,4,13],expans:[4,15,17,20],expect:[2,5,8,11,13,14,16,17,20,22,26,27],expens:[0,4],experi:[0,2,8,12,15,17,18,20,21,23,24,27],experienc:0,experiment:25,experiment_nam:25,experimentgrid:8,expert:[17,23],expertis:23,explain:[0,2,12,18,20],explan:[16,22],explicit:17,explicitli:[2,17],exploit:[17,21],explor:[7,8,17,20],exponenti:16,expos:[17,20,21,23],express:[1,4,13,14,16,18],extens:[17,18,22],extent:12,extrem:[17,18,22],eye:20,eysenbach:15,face:20,facebook:15,facilit:[0,3,15,25],fact:[16,18,25],facto:22,factor:[0,1,2,3,4,5,12,15,16],fail:[11,17,20,26],failur:[3,8,17,20],fair:[0,16,20],fairli:[0,20],fals:[0,18,27,28,30],famili:[1,15,16,17,21],familiar:20,famou:17,famous:16,fantast:23,far:[1,4,15,16,17,18,20],farquhar:20,farther:[1,13],fast:[15,20,26],faster:18,favor:21,featur:[2,21,26],feed_dict:[18,27],feedback:[6,23],feedforward:18,feel:[1,4,12,18,20,25],feinberg:[15,17],fellow:23,fend:3,fernando:15,fertil:20,feudal:15,few:[1,12,13,16,17,18,20,23,25,26,28,30],fewer:1,fictit:17,fictiti:17,fidel:15,field:[15,17,20,21,23],figur:[1,12,17,20,22,24],file:[12,24,25,26,27,30],filepath:27,find:[0,3,12,15,18,20,24,25,26],fine:[15,17,20],finish:[12,20,22],finished_rendering_this_epoch:18,finit:[0,2,5,14,16,18],finn:15,first:[0,1,2,3,14,16,17,18,20,22,23,27,30],fit:[0,17],five:9,fix:[0,2,3,14,16,17,18,23,24],flag:[12,20,24,26],flatten:[1,4,5],float32:[16,18,27],flood:26,flow:15,focu:[1,20],focus:20,foerster:20,folder:[25,26,30],follow:[0,2,12,14,15,16,17,18,20,21,22,23,26,27],follw:22,forc:[25,30],force_datestamp:[25,30],forego:[16,17],forev:16,forget:26,form:[1,2,3,8,13,14,20,27,30],formal:8,format:[8,26],formul:[15,16,18],formula:[5,12,18],forward:20,found:[0,1,4,5,18,22,23],foundat:[4,15,17,18,21],four:16,fpath:27,frame:20,framework:[15,16,17,23],free:[8,12,16,18,22],freecodecamp:22,frequenc:27,frequent:[3,12,16,20],fresh:2,frill:20,from:[0,1,2,3,4,5,8,9,11,13,14,15,16,17,18,20,21,22,23,24,26,27,28,30],frozenlak:20,frustrat:20,fujimoto:[3,15,17],full:[20,22,25,27,30],fulli:[16,21],fundament:[17,23],furthermor:4,futur:[2,13,14,15,16,17,23],gae:[1,4,5,15,18],gail:15,gain:[17,20,21],game:[0,16],gamma:[0,1,2,3,4,5,11,12,22],gan:15,gap:[0,1,2,3,4,5,15,23],garfinkel:6,gate:20,gauci:15,gaussian:[0,2,3],gave:6,gce:20,gcl:15,gener:[0,1,2,3,4,5,12,15,17,18,20,23,25,27,28,30],get:[0,1,2,3,4,5,6,10,11,12,16,17,18,20,21,22,23,24,25,26,27,28],get_act:26,get_stat:27,git:22,github:[18,22],give:[0,1,2,3,4,5,6,13,16,17,18,20,23,25,27,30],given:[0,1,2,3,4,5,6,11,12,16,17,18,24,25,27,30],glanc:1,global_step:28,global_variables_initi:27,goal:[12,15,16,17,20,25],goe:[1,18],going:[1,12,13,20,22],gone:17,good:[0,12,16,18,20,21,22,23],googl:20,govern:16,gpu:20,grace:27,gracefulli:26,grad:[8,14,28],gradient:[1,4,8,9,12,13,16,17,20,21,23,27,28],grads_and_var:28,gradual:20,grand:20,graph:[0,1,2,3,4,5,8,9,11,21,24],grappl:1,grate:6,gratefulli:6,great:[3,20,21],greedi:20,green:27,greff:20,greg:[6,20],gregor:15,grid:[25,30],ground:[0,6,17],group:[6,20,25,27],grow:[1,20,23],gru:20,grusli:15,guarante:[18,21],guid:[15,16,20],gym:[0,1,2,3,4,5,9,12,20,21,22,25,26,30],haarnoja:[2,15,17],habit:20,hack:26,hackathon:20,hacki:28,had:18,hadfield:6,half:2,halfcheetah:[11,12,20],halfwai:20,hall:6,hammer:20,hand:[15,16,26],handi:24,handicap:20,handl:[1,2,25,26,27,30],happen:[0,3,13,16,17,20,21,23],happi:4,hard:[1,4,16,17,20,23],harder:[3,20],hardest:[12,20],hardwar:20,has:[0,1,2,3,4,5,7,11,16,17,18,20,22,23,25,26,27],hash:15,hasn:[20,22],hasselt:15,hausknecht:15,hausman:15,have:[0,1,2,3,4,6,11,12,13,16,17,18,20,21,22,23,24,25,26,27,30],haven:20,head:2,hear:23,heard:20,heess:[1,15],help:[0,1,2,3,6,14,20,22,23,25,26,30],henc:[3,17],henderson:[15,20],her:[15,17],here:[0,1,2,4,6,12,13,16,17,18,20,22,23,25,26,27,30],hertz:20,hesit:6,hessel:15,hessian:4,hex:17,hid:[22,25],hidden:[12,16,25],hidden_dim:16,hidden_s:[11,18,25,27],hide:23,hierarch:15,hierarchi:[8,20],high:[1,2,4,5,11,15,16,20,21,23,25],higher:[0,2,3,5,16,17],highest:[2,16],highli:[0,18,23],highlight:[0,16,17,18,27],hindsight:[15,17],hint:[12,16],hirl:15,hiro:15,histori:[16,21],hit:[1,22],hobbyist:23,hold:[14,20],homebrew:22,hood:[20,25],hopefulli:20,hopper:[10,12,25],horgan:15,horizon:[2,5,14,15,16,17,18,20],horovod:28,hour:20,houthooft:15,how:[0,1,2,3,4,5,8,12,15,16,17,18,20,22,24,26,27],howev:[3,11,16,23,25,30],http:22,huge:20,human:[6,15,16],humanoid:20,hundr:20,hurdl:20,hurt:4,hyperparamet:[0,1,3,4,9,12,20,26,27,30],hypothes:20,i2a:[15,17],icm:15,idea:[5,16,20,21,22],ideal:20,ident:[15,18,24],idx:27,ignor:17,illustr:[2,25],ilya:15,imag:[20,21],imagin:[15,17],imit:8,immedi:[0,14,16,17,18,23],impact:[12,20],impala:15,implement:[8,11,15,16,17,20,21,23,25,26,27],impli:18,implicit:15,importantli:[2,23],imposs:26,impress:20,improv:[0,1,2,3,4,6,15,17,18,20,21],in_nam:[25,30],inaccur:18,inadvertantli:23,incent:1,includ:[0,1,2,3,4,5,6,8,15,16,17,18,20,22,23,25,27,30],inclus:[16,30],incompat:[17,25],incorpor:2,incorrect:3,increas:[1,2,4,20,27],incred:20,increment:20,incrementalist:20,independ:[2,18],index:[8,16],indic:[0,16,30],indirectli:17,individu:25,infinit:[2,5,14,16,18],influenc:20,info:[4,18,27],info_ph:4,inform:[15,16,17,20,21,22,23,26,27],inher:2,initi:[1,4,5,20,21,27],inner:[13,14],inner_prefix:25,innermost:[3,13],input:[0,1,2,3,4,5,12,16,27],insid:13,insight:[20,21],inspir:20,instabl:17,instal:8,installtest:22,installtest_s0:22,instanc:[16,18,20,26],instanti:21,instead:[0,1,2,3,4,12,16,17,20,23,24,25,30],instruct:[12,16,22,23],instrument:20,int32:[18,27],integ:4,integr:20,intend:[20,23],intens:17,intent:15,interact:[0,1,2,3,4,5,9,16,17,18,21],interactivesess:11,interest:[18,20,23],interestingli:15,interfac:26,interleav:[0,20],intermedi:[4,18],intern:[20,24,27],interpol:[0,2,3,15,17],interpret:[18,24,25,26],intervent:15,intro:[8,20],introduc:[2,3,4,15,16,20],introduct:[0,5,8,12,16,17,20],intuit:[1,16,18,20],invent:20,invers:[2,4,8],invertedpendulum:[12,20],investig:20,involv:[6,11,17,18,23],ioff:20,ipg:15,ipython:26,iqn:15,irl:15,irpan:20,islam:[15,20],isn:[2,4,27,30],issu:[1,3,4,11,16,20,21,22,23],iter:[4,14,17,20,26,27],itr:[26,27],its:[0,1,2,3,4,5,16,17,18,22,27],jack:6,jaderberg:15,jakob:20,job:21,jog:13,john:20,joint:[13,16],jointli:20,jona:6,josh:[7,20],joshua:20,json:[24,26,27],jump:[15,23],just:[0,3,4,14,16,18,20,22,25,26,27,30],justif:15,kakad:[4,15],kalashnikov:15,karpathi:20,keep:[0,1,3,4,20,26,27],kei:[8,17,18,20,21,23,25,26,27,30],kera:27,keyword:[0,1,2,3,4,5,25,27],khan:6,kick:1,kind:[1,3,4,8,16,18,20,23],kingma:20,know:[11,20,23],knowledg:[20,23],known:[12,26],krogh:20,kroneck:15,kurutach:15,kwarg:[0,1,2,3,4,5,25,26,27,28,30],l62:18,l86:18,l99:18,lag:0,lagrangian:4,lai:18,laid:23,lam:[1,4,5],lambda:[1,4,5,25],land:16,landscap:17,langford:[4,15],languag:16,larg:[0,1,4,15],largest:4,larissa:6,last:[12,13,14,16,18,26],lastli:[3,4],late:21,later:[0,2,13,15,16,18,20,26],latest:[1,4,5,26],latter:0,launch:[8,12,17,20,23,26,28,30],law:[14,16],layer:[15,16,20,25,27],lead:[2,3,4,5,15,16,18,20,21],leak:30,leakag:27,leandro:6,learn:[1,3,4,5,7,8,10,11,12,16,18,21,22,23,25,26],learner:[15,20],least:[1,20,22],leav:[11,14,15,22,27],lectur:23,led:[5,21],left:[9,13,30],legend:24,lemma:[8,13],len:[18,26,27],length:[0,1,2,3,4,5,12,18,20,26],less:[1,3,4,5,17,20],lesson:20,let:[0,1,2,4,5,8,11,14,16,20,22,25,26,27],letter:30,level:[16,20,21,23],lfp:15,liang:15,libopenmpi:22,librari:[15,20,22,25],licens:[22,26],lie:3,life:18,like:[0,1,2,4,12,13,16,17,18,20,21,22,23,24,25,26,27],likelihood:16,lilian:[6,20],lillicrap:[0,15,17,20],limit:[1,3,4,25],line:[4,8,9,11,16,18,20,21,23,26,27],linear:16,link:[8,15,28],linux:22,list:[6,11,15,17,18,20,23,25,30],liter:18,literatur:[16,18,20,23],littl:[2,12,16,17],liu:15,live:[16,17,23],load:[0,1,2,3,4,5,8,23],load_data:27,load_polici:26,local:[1,2,4,5,17,20,25,27],locat:[8,25],lock:23,locomot:[1,15],log:[1,2,4,5,8,14,16,21,23,24,26],log_prob:18,log_softmax:18,log_tabular:27,logdir:24,logger:[0,1,2,3,4,5,8,21,26,30],logger_kwarg:[0,1,2,3,4,5,25,27,30],logic:[13,21],logit:[16,18,27],logp:[1,4,5],logp_pi:[1,2,4,5],logprob:18,logx:27,longer:20,look:[1,12,15,20,22,24,26,27,28],loop:[16,17,18,21,27],lose:[16,20],loss:[0,1,2,3,11,12,18,20,21,27,28],lost:27,lot:[12,17,20],low:[2,18],lower:[2,5],lstm:20,lucid:5,lunarland:[22,25],machin:[12,20,23,25,30],macro:15,maddi:6,made:[13,21],mahmood:15,mai:[0,1,2,3,4,5,6,12,16,17,20,22,24,30],main:[0,1,2,3,4,5,16,17,18,20,21,27],maintain:27,mainten:30,make:[0,1,2,3,4,5,10,12,13,16,17,18,20,21,22,23,25,26,27,28,30],maml:15,manag:[21,22],mani:[6,12,15,16,17,18,20,22,23,24,25,26,27,28,30],mania:15,manipul:15,manner:26,map:[15,16,26,27],margin:13,markov:[14,16],mask:[11,18],mass:2,master:15,masteri:20,match:[23,24,27],materi:[6,17,18,20,23],math:[0,1,4,16,18,20,23],mathemat:[11,13,16,18,20,21],matl:15,matrix:[4,11,16],matter:[4,15,16,20,23],matthew:20,matthia:6,max:[0,1,12,20,27,28],max_ep_len:[0,1,2,3,4,5],maxim:[0,1,2,3,15,16,17,18],maximum:[0,1,2,3,4,5,12,15,26],maxtest:27,mayb:30,mbmf:[15,17],mbve:[15,17],mdp:[15,16],mean:[0,1,2,3,4,5,9,12,16,17,18,20,26,27,28],meaning:20,meant:[16,26],measur:[2,4,9,16,18,20,24],mechan:20,memori:[8,20],menel:6,mention:0,merg:15,merit:20,merlin:15,mess:11,messag:27,meta:[8,17,20],metamim:15,method:[1,3,4,5,12,15,16,17,18,20,21,25,27,28],methodolog:15,metric:[9,18,21,26,27],mfec:15,middl:23,might:[0,11,12,16,18,20,22],mile:6,million:4,min:[1,20,27,28],mind:20,minibatch:[0,1,2,3,9],minim:[0,2,3,18,23,27],minimum:2,mintest:27,minut:[12,20,22],mirag:15,mishra:15,miss:23,mission:8,mitig:21,mlp:[11,12,16,18,20,21,27],mlp_actor_crit:[0,1,2,3,4,5,11],mnih:[15,17,20],mnist:27,mode:[3,8,17,20],model:[8,20,21,26,27],model_info:26,modern:[5,17],modif:[4,18,23,28],modifi:26,modul:[8,15,23,26],modular:[17,20],modularrl:[1,4],momentum:20,monoton:[4,15,20],mont:17,month:[20,23],more:[0,1,2,4,12,16,17,18,20,23,24,25,26,28],morn:13,most:[0,14,16,17,18,20,21,23,25,26,27],mostli:16,motiv:[0,1,4,20],motor:15,move:[16,20,23],mpc:17,mpi:[1,4,5,8,23,25,30],mpi_avg:28,mpi_fork:28,mpi_statistics_scalar:28,mpi_tf:28,mpi_tool:28,mpiadamoptim:28,mpo:15,msbe:[0,2],msg:27,much:[0,1,3,16,17,18,20,21],mujoco:[8,9,12,21],mujocotest:22,mujuco:[],multi:[20,23],multinomi:[16,18],multipl:[1,13,16,24,26,27,30],multipli:11,multitask:8,multivari:[16,20],must:[0,1,2,3,4,5,16,18,25,30],mutual:15,n_act:18,nachum:15,nagabandi:[15,17],name:[6,16,18,20,24,25,26,27,28,30],nameofquant:27,narrowli:20,natur:[4,15,16,20,23],nec:15,necessari:20,necessarili:23,need:[0,2,4,13,15,16,18,20,21,22,23,24,26,27],neg:[1,18],nest:30,net:[15,16,20],network:[0,2,3,4,5,9,11,12,15,16,17,18,20,25],neural:[0,2,4,5,12,15,16,17,18,20,25],neutral:18,never:[12,17,20],next:[0,3,16,20,21,23,27],nice:[4,25,26],nois:[0,2,3,9,16,18],noise_clip:3,non:[0,11,12,17,21,23],none:[11,16,18,25,26,27,28,30],nonetheless:23,nonneg:[4,16],nonzero:18,norend:26,norm:20,normal:[0,2,3,4,16,18,20,23,28],notabl:16,notat:[0,14,16,17],note:[0,1,2,16,24,25,26,27,30],noth:[18,25],notion:16,nov:23,novemb:22,novic:20,now:[2,13,16,17,18,20],npg:4,num_cpu:[25,30],num_proc:28,num_run:25,num_sampl:18,number:[0,1,2,3,4,5,6,16,17,18,20,23,24,25,26,27,28,30],numer:[4,18,27],numpi:[11,27],nut:20,nutshel:16,obei:16,object:[1,4,16,17,18,20,21,23,25],obs:[16,18],obs_dim:[16,18],obs_ph:18,obscur:23,observ:[0,12,15,17,18,21,23,27],obtain:[0,2,13,16,17,18],obviou:23,octob:20,odd:[13,20],odyssei:20,off:[0,1,2,3,6,9,15,16,17,20,23,24],often:[0,1,2,3,4,5,16,18,20,25],olah:20,old:[0,1,4,18,21,26],older:26,olsson:20,omit:[16,17,23,30],onc:[0,1,3,4,9,20,22,27],one:[2,3,12,16,18,20,21,22,23,24,25,26,30],one_hot:[18,27],ones:[17,20,25],onli:[0,1,2,3,4,11,12,13,14,16,17,18,20,22,23,24,25,26,27],open:[15,20],openai:[0,1,2,3,4,5,7,12,15,20,22,23,25],openmpi:8,oper:4,oppos:20,ops:21,opt:15,optim:[0,2,5,8,15,17,20,21,28],optima:[1,4,5],optimum:[2,15],option:[8,13,15,17,18,20,24,26,27,28,30],order:[0,1,4,6,12,16,20,21,22,23],organ:[20,22,23,26],orient:20,origin:[0,1,2,4,13,20,23,28],orthogon:17,ostrovski:15,osx:22,other:[8,9,12,16,17,21,23,26,27],otherwis:[5,6,20,22,23,24,25,27],ouput:11,our:[0,1,2,3,4,5,8,12,13,16,17,18,26],out:[1,2,3,6,11,12,13,14,17,18,20,21,22,23,24,25,26,27],outcom:[2,18],outdat:0,outer:[11,13],outer_prefix:25,outlin:20,output:[0,1,2,3,4,5,8,12,16,17,18,24,25,27],output_activ:[11,27],output_dir:[25,27,30],output_directori:[24,26],output_fnam:27,output_typ:27,outsid:13,outward:3,over:[0,1,2,3,4,5,9,10,11,12,13,14,16,17,18,20,24,25,27,28],overal:20,overestim:3,overfit:[0,18,20],overview:20,overwhelm:22,overwrit:[26,27],own:[16,20],packag:[21,22,23,25,26],page:[2,8,9,12,17,18,22,23,25,26,27,28,29],pai:17,pain:[2,15,23,26],painfulli:[0,4],painless:23,pair:[0,1,2,3,4,5,16,18,30],paper:[8,17,18,20,23],paradigm:23,parallel:[0,1,2,3,4,5,15,20,25,27],param:[25,28,30],param_nam:25,paramet:[0,1,2,3,4,5,16,17,18,20,21,24,25,27,28,30],parameter:[2,16,18],parisotto:15,pars:23,parse_arg:25,parser:25,part:[0,3,5,8,11,12,20],partial:[15,16,20,26],particular:[3,6,16,18,20,21,25],particularli:[1,17,26],partli:11,pass:[21,25,26,27,30],past:[8,14,20],path:[12,20,24,25,26,30],pathak:15,pathnet:15,patient:22,pattern:23,pcl:15,peak:3,peek:20,penal:1,penalti:[1,15],peng:15,peopl:[6,20,23],per:[0,1,4,5,9,15,25,27],perceiv:16,perfect:20,perfectli:0,perform:[0,1,2,3,4,5,8,12,15,17,18,20,21,23,24,25,26],period:21,person:[20,22,23],perspect:[15,20,23],peter:[6,15],pg_math:18,pgql:15,phd:7,philosophi:8,physic:[15,16,22],pi_loss:[11,12],pi_lr:[0,1,3,5],pick:[16,24],pickl:26,piec:[18,23,30],pieter:[6,7],pip:22,pipe:20,pixel:16,pixelcnn:15,pkl:26,place:[16,17,20],placehold:[0,1,2,3,4,5,12,16,18,21,27],plai:[4,11,15,16,17,20,23],plan:[8,17],plant:20,plappert:6,platform:15,plausibl:20,pleas:[6,16,22,26],plot:[8,12,22,25,29],plotter:[8,24,26,27],plu:[2,16,20,30],plug:18,point:[0,2,6,13,15,16,17,18,20,23,26],polici:[2,3,8,9,13,17,20,22,24,25],policy_delai:3,polyak:[0,2,3],popular:17,pose:0,posit:[1,4,24],possibl:[0,1,4,12,16,17,20,22,23,25,26,27],post:23,potenti:[17,20,21,30],power:23,ppo:[1,2,9,15,16,17,18,20,21,22,25],ppo_ant:25,ppo_ant_cli0:25,practic:[16,18],practition:[18,23],pre:26,preced:14,precis:16,precommit:20,predat:21,predict:[15,17,27],prefer:[0,15,16,22,27],prefix:24,prematur:2,prepar:[17,20,27],preprocess:27,preregistr:20,prescrib:20,presenc:16,present:[1,6,17,23,26],presum:[0,30],pretti:[1,3,16,18,20,23],prevent:[2,20,22,26,27],previou:[0,18,20],previous:[0,1,2,3,4,5,7,12,27],primari:[1,17],primarili:7,principl:[15,17],print:[26,27,30],prior:[16,20],priorit:15,pritzel:15,prob:[8,14],probabl:[0,1,2,3,4,5,13,16,18,20,22],problem:[0,1,2,4,8,15,17,18,20,22,26],problem_set_1:12,problem_set_1_solut:12,problem_set_2:12,problemat:[0,21],proc:30,proc_id:28,proce:13,procedur:[1,4,5,17,27],process:[11,12,14,16,20,25,26,27,28,30],produc:[4,11,17,23,24,25,26,28],product:[4,11,16],profess:23,profit:1,profound:16,program:[0,1,2,3,4,5,20],progress:[1,4,5,10,12,15,20,21,22,23,26,27],project:[6,8,26],promin:17,promis:20,prop:15,properti:16,proport:[2,18],propos:20,proprietari:22,protocol:26,prove:[4,13,16,20],provid:[0,1,2,3,4,5,15,16,24,25,27],proxim:[8,15,17,20,21],pseudocod:[12,23],pseudocount:15,publish:[2,7,21,23],pull:[13,14],punish:16,pure:17,purpos:[15,18,27,28],push:[5,18,20],put:[0,1,11,12,16,18],python3:22,python:[0,8,12,24,25,26,30],pytorch:20,q1_pi:[2,3],q2_pi:2,q_loss:[11,12],q_lr:[0,3],q_pi:[0,11,12],q_pi_targ:[11,12],qualiti:[0,3,12],quantil:[15,17],quantiti:[2,4,27],quantity_valu:27,question:[1,16,17,23],quickli:[3,4,10,12,20,22,23],quit:[10,16,17,20],r2d2:15,r_ph:[11,12],rahtz:20,rai:[0,1,5,6],rainbow:15,rais:[16,18,25],ramp:23,ran:26,randint:27,random:[0,1,2,3,4,5,9,10,11,12,13,15,18,20,21,24,25,27,30],random_norm:16,randomli:16,rang:[0,2,3,17,18,20,23,25,27,30],rank:[27,28],rate:[0,1,2,3,4,5],raw:[16,20],reach:[6,12,26],reactor:15,read:[15,18,20,22,23],readi:[4,20],readm:22,readout:25,readthedoc:26,real:[0,2,3,8,16,17],realli:[17,18,20,24],rearrang:18,reason:[0,1,16,17,20,21,23,26],recal:[18,24],recap:[0,8],receiv:[23,25],recent:[0,5,14,16,17,18,21,26,27],recip:20,recognit:20,recommend:[0,3,18,22,25],record:[18,26,27],recreat:26,recurr:[15,20,21],recurs:24,reduc:[0,1,3,18,20],reduce_mean:[11,12,18,27],reduce_sum:18,reevalu:15,refer:[8,15,16,18,22,23,24,25],referenc:12,reflect:[16,21],refresh:18,regardless:[17,27],region:[8,15,17,21],regress:[3,15,17],regular:[1,3,17,20,23],reimplement:[12,20],reinforc:[0,4,5,7,8,13,16,18,20,23],reinvent:20,rel:[4,12,16,17,18,25],relat:[0,7,15,16,20,21],releas:[0,2,3,6,23],relev:[21,23,27],reli:[1,13,24],reliabl:[17,20,21],relu:[9,11,25,27],remaind:14,remov:[1,2,18,20,23,24],render:[18,26],reparameter:[2,20],repeat:16,repeatedli:18,replai:[0,2,3,15,17],replay_s:[0,2,3],repo:[0,2,3,23],report:[9,20,23,24,27,30],repositori:15,repres:[2,16,17,18,21,26],represent:17,reproduc:[8,20],requir:[2,4,20,22,23],research:[7,8,15,18,21,22,23,25],reset:[15,18],reshap:[11,18,27],residu:20,resnet:20,resort:20,resourc:[8,23,25],respect:[0,2,3,4,13,14,17,18,20],rest:[2,14,17,18],restor:[26,27],restore_tf_graph:[0,1,2,3,4,5,26,27],result:[0,2,3,4,8,11,12,13,14,15,16,17,18,20,22,23,26,27,29,30],resum:26,retro:20,reus:[11,17,20,21],reveal:15,revers:18,review:[8,12,17,23],rew:18,reward:[0,1,2,4,5,8,11,13,14,15,17,20,21],reward_to_go:18,rework:16,rewrit:[2,14],rgb:16,rich:[1,15],richer:17,right:[0,2,8,16,18],rigor:8,risk:20,rllab:[0,1,4,5,15,20,23,30],rllib:[0,1,5,15,23],rmc:15,rnd:15,rnn:20,roadmap:23,robot:[15,16,22],robust:20,roi:[15,17],role:23,rollout:[0,1,2,3,4,5,16,26],root:15,rothfuss:6,rough:20,roughli:[0,1,2,23],row:11,rtg:18,ruder:20,rule:[0,1,2,4,5,13,16,18,20,24],run:[0,1,2,3,4,5,8,9,11,12,16,18,20,21,22,23,24,27],run_entrypoint:30,run_kwarg:25,run_polici:26,run_util:[25,30],rusu:15,s_t:18,sac:[2,9,15,17,21,23,26],safe:[7,15,23],safeti:[7,8,20,23],sai:[0,1,2,12,13,16,22,24],said:[17,23],salakhutdinov:15,saliman:[15,20],same:[0,1,2,3,4,11,12,13,14,16,18,20,24,25,26,27,28],sampl:[0,1,2,3,4,5,12,15,16,17,18,20,21,27,28],santoro:15,satisfi:[0,1,2,3,4,5,17,21],saunder:15,save:[8,18,21,23,30],save_config:27,save_freq:[0,1,2,3,4,5,26,27],save_st:[26,27],saved_model:26,savedmodel:26,saver:26,scalabl:15,scalar:28,scale:[0,1,2,3,8,20,25],schaal:15,schaul:15,scheme:20,schiavo:6,schmidhub:[15,17],scholar:23,schulman:[1,4,5,15,17,20],scienc:[20,22,23],scientif:[20,23],scientist:7,scope:20,score:[12,17],scour:20,scratch:20,screen:26,script:[8,27,28],search:[4,8,15,17,20,24],sebastian:20,second:[0,1,16],section:[0,12,14,17,18,20,21,25,26],see:[0,1,2,3,4,9,11,12,16,17,18,20,21,22,23,24,25,26,27,28,29],seed0:24,seed10:24,seed:[0,1,2,3,4,5,9,10,11,12,20,21,24,25,27,30],seem:[1,18,20],seemingli:4,seen:[1,18],sel:24,select:[0,1,2,3,4,5,16,17,18,24],self:[15,16,17,23,30],send:18,sens:[16,17,18,20,23,26],separ:[0,11,13,20,23,24,26,27,30],septemb:17,sequenc:[16,20,30],serendipit:17,seri:[20,23,25],serial:[26,27,30],seriou:23,serv:[1,3,8,15,25,27],servic:[6,15],sess:[11,18,27],session:[21,27],set:[0,2,3,4,5,8,9,14,16,18,20,21,23,24,26,27,30],setup:[21,26,27],setup_logger_kwarg:30,setup_tf_sav:27,sever:[15,17,20,23,24,25,27],sgd:[0,1,2,3,20],shade:9,shape:[0,1,2,3,4,5,11,16,18,27],share:[16,23,24,25],sharp:3,shift:20,ship:[20,24,25,26,27,30],shogi:15,shorten:[14,30],shorter:[17,22],shortest:20,shorthand:[0,16,25,30],shot:15,should:[6,12,13,15,20,23,30],show:[0,3,5,9,13,14,15,18,24],side:[16,17,18],sidestep:4,signal:[0,3,16,18,20],signific:[12,20],significantli:[1,20],silent:20,silver:[0,15,17],similar:[3,20,23,25],similarli:[21,23],simpl:[0,1,15,16,18,20,21,22,23,24,25,27],simple_sav:26,simpler:[0,1,20,30],simplest:[8,13,20],simpli:[17,25],simplic:20,simplif:20,simplifi:[1,16],simul:[16,22],simultan:[15,26],sinc:[0,4,25],singl:[1,3,4,16,18,20,21,23,25,27],situat:26,six:[12,23],size:[0,2,3,4,9,11,12,16,18,25],skill:[15,20,23],slide:20,slight:18,slightli:[2,4,18,20,26,30],slow:[0,4,15,26],small:[1,4,17,20,25],smaller:[3,12],smallest:4,smallish:4,smooth:[2,3,24],snag:22,snail:15,snapshot:26,snippet:16,soak:25,soft:[8,15,17,21,23],softmax:16,softmax_cross_entropi:27,softwar:[6,22],soil:20,solid:[9,20],solut:[0,4,12,25],solv:[0,1,4,15,20,23],some:[0,3,4,11,16,17,18,20,21,22,23,24,25,26,27],someon:15,somerandomnumb:27,someth:[2,4,16,17,20,23,26,27,28],sometim:[3,12,16,18,20,26],somewher:20,sonic:20,soon:21,sophist:[16,23,28],sort:[20,25],sota:[20,21],sourc:[0,1,2,3,4,5,15,22,27,28,30],space:[0,1,2,3,4,5,15,17,20,21],speak:2,special:[1,4,16,20,26,30],specif:[0,11,12,16,18,20,21,23,30],specifi:[4,24,25,26,27],spectrum:17,speedup:25,spend:20,spent:20,spheric:16,spin:[0,1,2,3,4,5,6,7,12,18,21,23,24,25,26,27,30],spinningup:[22,25,26,30],spinup:[0,1,2,3,4,5,12,18,22,24,25,26,27,28,30],split:[13,21,28,30],squar:[0,2,3,18,25],squash:2,squeez:[11,18],srivastava:20,ss_exp_name_:30,stabil:[4,17,21],stabl:[0,17,18],stablest:2,stack:16,stage:20,stand:12,standalon:[16,23],standard:[0,2,16,17,18,20,21,22,23,25,27],stanford:20,stark:16,start:[0,2,3,14,15,16,17,18,20,21,22,23],start_step:[0,2,3],start_tim:27,starter:12,stat:20,state:[0,1,2,3,4,5,13,14,17,18,20,23,26,27,30],state_dict:27,statist:[4,20,27,28],stave:1,std:[2,9,12,16,20,27,28],stddev:[0,3],stdout:27,stdtest:27,steep:20,steeper:20,stein:15,step:[0,1,2,3,4,5,9,13,16,18,20,23,30],steps_per_epoch:[0,1,2,3,4,5,12,25,27,30],steve:15,still:[1,2,15,20,23],stochast:[0,1,2,4,5,15,17,18,20,26],stone:15,stook:15,stop:[1,4],stop_gradi:[11,12],store:[4,21,25,26,27,30],straightforward:17,strateg:15,strategi:[15,16,23],straw:15,streamlin:23,strength:[17,20],strictli:20,string:[24,25,26,27,30],strong:20,stronger:[20,21],strongest:20,strongli:18,structur:[15,17,21,23,24,26,30],stuck:[10,20],student:[7,18,22,23],studi:[7,15,16,18,20,23],stuff:22,style:[0,2,17],sub:17,subfold:25,subject:[2,16,20],submit:24,subprocess:30,subroutin:[0,17,20],subscript:16,subsect:[13,18],substanti:[3,10,16,17,20],substitut:[2,16],substr:[24,25],subtl:4,subtract:18,succe:[20,22],success:[12,16,30],successfulli:22,successor:2,sudo:22,suffici:4,suggest:[0,12,20],suit:9,suitabl:21,sum:[16,18],supersed:20,supervis:[15,18,20],supplementari:20,suppli:[23,25],support:[0,1,2,3,4,5,6,8,20,21,22,25,26],suppos:[0,1,16,18,24],sure:[1,4,5,20,23,25,26,27],surpris:15,surrog:[4,17],suspect:17,sutton:[5,15,17],swap:20,symbol:[0,1,2,3,4,5,12,16],sync:28,sync_all_param:28,synchron:20,system:[16,22],systemat:[15,20],szegedi:20,szepesvari:[15,17,20],tab:[26,27],tack:20,tailor:27,take:[0,1,2,3,4,5,12,16,17,18,20,23,25,30],taken:[16,17,18,28,30],talk:16,tang:15,tanh:[9,11,16,25,27],target:[0,2,3,11],target_kl:1,target_nois:3,task:[9,15,17,20,21,23,25],tau:18,taught:16,taxonomi:[8,15],taylor:[4,20],td3:[0,2,3,9,15,17,21,23],teach:[16,23],team:7,technic:16,techniqu:[17,20,21],technolog:23,tell:[0,1,16,20,24],temp:27,templat:21,tempor:15,ten:9,tend:[2,4,16,17],tensor:[11,16,18,26,27],tensorflow:[0,1,2,3,4,5,8,11,12,16,18,20,22,25,26,27],tensorshap:11,term:[0,1,2,3,4,5,12,13,16,18,20,23,30],termin:[0,26,28],terminolog:[8,17,20,23],terribl:17,test:[0,2,3,6,9,17,20,21,22,26,27],test_polici:[0,1,2,3,4,5,22,26],textbook:23,than:[1,3,12,16,17,18,20,24,25,30],thank:6,thei:[0,1,2,4,11,12,16,17,18,20,21,23,25,26,27],them:[0,12,15,16,17,18,20,21,23,24,25,26,27],theorem:[15,20],theoret:[0,4,15],theori:[0,5,8,18,20,21,23],thi:[0,1,2,3,4,5,6,8,11,12,13,14,15,16,17,18,20,21,22,24,26,27,28,30],thin:[25,27],thing:[0,4,12,14,16,17,18,20,22,25,26,27],think:[1,2,6,12,13,15,16,17,20,23],thorough:[15,20],thoroughli:20,those:[9,12,16,17,18,20,24,25,26],though:[0,2,13,16,18,22,25],thought:[0,2,8],thousand:4,thread:20,three:[3,10,11,12,18,23,30],threshold:1,through:[2,4,16,17,18,20,21,22,23,24,25],throughout:[0,3,18,23,26,27],thu:[0,1,2,3,4,13,18],thunk:[25,30],tie:18,tied:20,tim:20,time:[0,2,3,12,13,14,15,16,17,20,22,23,25,27],timeless:5,timelimit:26,timestamp:[25,30],timestep:[2,9,14],tmp:[26,27],togeth:[0,3,11,17,18],toi:20,too:[0,1,4,20],tool:[0,1,2,3,4,5,8,16,23,25,26,27,30],top:[16,27],topic:[7,18,20,23],total:[11,22],totalenvinteract:24,totalgradientstep:27,toward:[0,1,2,3,15,22],tpro:9,trace:15,traceback:26,track:[22,27,30],trade:[2,17,21,23],tradeoff:2,trail:21,train:[0,1,2,3,4,5,8,9,10,15,16,17,18,20,21,22,25,27],train_mnist:27,train_one_epoch:18,train_op:[18,27],train_pi_it:1,train_v_it:[1,4,5,10,12],trajectori:[0,1,2,3,4,5,9,13,14,18,20,26],transfer:[8,17,20],transit:[0,16,17],translat:[18,20],transmut:23,trap:[1,4,5],treat:[0,11,13,16],treatment:[18,25],tree:[15,17],tri:[1,23],triad:17,trial:[15,16,22,23],trick:[0,1,2,3,17,18,20,23],tricki:20,trivial:[0,11],troubl:[20,22],trpo:[1,2,4,10,15,16,17,18,21],truli:15,trust:[8,15,17,21],truth:17,tsitsikli:[15,17],tucker:15,tune:[0,2,3,15,17,20],tupl:[0,16,18],turn:[1,11,17,18,20],turnaround:20,tutori:20,tweak:[18,20,26],twin:[8,17,21],two:[0,1,2,3,13,15,16,17,18,20,21,25,30],txt:[26,27],type:25,typic:[1,3,4,5,12,16,17,18,22,23,24,27],ucl:20,ultim:[18,21],unaccept:0,unbeat:15,unchang:3,unclear:16,uncommon:16,uncorrel:[0,3],under:[16,17,20,25],underli:[0,5,16],underscor:30,understand:[20,23],undiscount:[5,14,16,18],unfamiliar:[20,23],unifi:15,uniform:[0,2,3],unintent:15,uniqu:27,unit:[9,16,27],univers:15,unless:[20,24,25],unlik:17,unnecessari:20,unpack:26,unreal:15,unreason:20,unseri:30,unsolv:20,unstabl:[0,21],until:[5,21],untrain:10,upcom:23,updat:[0,1,2,3,4,5,9,17,18,20,21,22,23,28],ups:23,upsid:17,use:[0,1,2,3,4,5,9,12,13,16,17,18,20,21,22,23,24,25,26,27,28],use_soln:12,used:[0,1,2,3,4,5,9,12,16,17,18,21,22,23,24,25,26,27,30],useful:[0,3,12,15,16,17,18,20,21,22,25,30],useless:18,user:[8,23,25,30],user_config:[25,26,30],uses:[0,1,2,3,12,17,20,25],using:[0,1,2,3,4,12,13,14,15,16,17,18,20,21,23,25,26,27,30],usual:[0,1,2,3,4,16,17,18,20,27],util:[8,21,23,24,25,26,27],uvfa:15,vail:15,val:[27,30],valid:[0,2,3,16,18,20,25,30],valor:15,valu:[0,1,2,3,4,5,9,10,11,15,17,18,20,21,24,25,27,30],valuabl:20,value_1:25,value_2:25,van:[15,17],vandenbergh:4,vanilla:[4,8,18,20,21],var_list:28,vari:[2,24,25],variabl:[2,13,18,20,26,28],variable_scop:11,varianc:[18,20,25],variant:[0,1,17,20,27,30],variant_nam:30,variantgener:30,variat:[15,20],varieti:[0,3,15,16,20,21,23],variou:[6,12,20,21,25],vaswani:20,vector:[2,4,11,16,28],veloc:16,veri:[0,4,12,16,18,21,22],verifi:[2,20],version:[1,2,4,5,11,12,17,18,20,26,27],versu:12,vestigi:25,vezhnevet:15,vf_lr:[1,4,5],via:[0,1,5,12,13,15,16,17,21,24,27],vic:15,video:[20,22],view:[18,23],vime:15,virtual:23,vision:15,visit:4,visual:[16,20],vivo:12,volatil:3,vpg:[2,4,5,9,16,18,20,21],wai:[0,1,2,3,4,5,13,16,17,18,20,21,22,23,24,25,26,27,30],walker2d:[9,22,25],walker:25,wall:12,wang:15,want:[0,16,17,20,22,23,25,26,27,30],wari:20,warm:23,wasn:[21,26],watch:[20,22,25,26],wayn:15,weak:[17,20],weaker:[20,21],weber:[15,17],week:[20,23],weight:[2,15,16,18,20],weights_ph:18,weird:11,welcom:[16,23],well:[0,1,2,3,17,18,20,22,23,27],weng:[6,20],went:13,were:[0,3,4,6,9,20],what:[0,1,2,3,8,11,12,13,15,18,20,22,24,25,26,27],whatev:[16,20,27],whatsoev:18,wheel:20,when:[0,1,4,11,12,14,16,17,18,20,21,22,26,27,30],whenev:[4,16,20],where:[0,1,2,3,4,5,11,12,13,15,16,17,18,20,21,23,26,30],wherea:11,wherev:16,whether:[0,16,17,30],which:[0,1,2,3,4,5,11,12,14,15,16,17,18,20,21,22,23,24,25,26,27,30],whichev:[3,16],who:[6,20,23],whole:[0,1,2,3,4,5,11,20],whose:[16,17,18,27,30],why:[2,8,13,16,17,22],wide:[0,3,15,16,20,23,24],widespread:18,willing:17,win:20,wind:2,window:[16,17,22,24],wipe:27,wish:0,with_min_and_max:[27,28],within:12,without:[1,4,9,12,15,18,20,23,28],won:[16,18,22],wonder:21,word:16,work:[0,2,3,4,5,7,12,13,17,18,20,21,22,23,24],workaround:22,worker:[27,28],world:[8,16,17],worri:[13,22],wors:18,worth:[0,1,4,15,20,22],worthwhil:[18,20],would:[0,3,4,11,12,16,17,18,20,23,27],wouldn:[20,25],wrap:[20,26,30],wrapper:[25,26],write:[13,16,17,20,22,25,27],writer:15,written:[11,20,27],wrong:[12,18,20],wulfmeier:15,x_ph:[0,1,2,3,4,5,27],x_train:27,xaxi:24,y_ph:27,y_train:27,yang:6,year:[20,22,23],yet:[2,20,23],yield:[4,20],you:[5,6,8,11,12,15,17,20,21,23,28,30],younger:21,your:[0,8,12,16,20,26,27],your_env:26,yourself:20,zeigler:6,zero:[0,3,4,13,18],zeros_lik:18,ziebart:15,zokhov:6,zshell:25},titles:["Deep Deterministic Policy Gradient","Proximal Policy Optimization","Soft Actor-Critic","Twin Delayed DDPG","Trust Region Policy Optimization","Vanilla Policy Gradient","Acknowledgements","About the Author","Welcome to Spinning Up in Deep RL!","Benchmarks for Spinning Up Implementations","Solution to Exercise 2.1","Solution to Exercise 2.2","Exercises","Extra Material","Extra Material","Key Papers in Deep RL","Part 1: Key Concepts in RL","Part 2: Kinds of RL Algorithms","Part 3: Intro to Policy Optimization","Limitations and Frontiers","Spinning Up as a Deep RL Researcher","Algorithms","Installation","Introduction","Plotting Results","Running Experiments","Experiment Outputs","Logger","MPI Tools","Plotter","Run Utils"],titleterms:{"class":27,"function":[12,14,16,26],"long":19,"public":[0,1,2,3,4,5],"return":16,Doing:20,Not:26,One:25,The:[0,11,16,20,21],These:[0,1,4,5,21],Using:[14,25,26,27],about:7,acknowledg:6,action:[15,16],actor:2,actual:25,advantag:16,algorithm:[12,15,17,21,26],analysi:15,ant:9,author:7,background:[0,1,2,3,4,5,20],base:[15,17],baselin:[15,18],basic:[12,18],bellman:16,benchmark:9,bonu:15,bug:[11,12],built:23,call:30,can:16,categor:16,caution:[],challeng:12,check:22,classic:15,close:20,code:[11,12,21,23],combin:15,command:25,complex:19,comput:12,concept:16,config:25,consist:15,content:[0,1,2,3,4,5,9,12,15,16,17,18,20,21,22,23,25,26,27,28,30],core:[21,28],critic:2,critiqu:15,ddpg:[0,3,12],deep:[0,8,15,20],delai:3,depend:15,deriv:18,design:[19,23],detail:[9,25],determin:25,determinist:[0,15,16],develop:20,diagon:16,direct:15,directori:26,distract:[13,18],distribut:15,document:[0,1,2,3,4,5],don:[13,18,25],each:9,entropi:2,environ:[9,25,26],equat:[0,1,2,3,4,5,16],error:26,evolutionari:15,exampl:27,exercis:[10,11,12],expect:18,experi:[9,25,26,30],experimentgrid:[25,30],exploit:[0,1,2,3,4,5],explor:[0,1,2,3,4,5,15],extra:[13,14,25],fact:[0,1,2,3,4,5],failur:12,file:21,fit:12,flag:25,form:18,formal:16,format:21,formula:14,found:26,free:[15,17],from:[12,25],frontier:19,gaussian:[12,16],given:15,grad:18,gradient:[0,5,14,15,18],graph:[12,27],guid:25,gum:11,gym:[],halfcheetah:9,hierarchi:15,hopper:9,horizon:19,how:[11,23,25],hyperparamet:25,imit:15,implement:[0,1,2,3,4,5,9,12,18],includ:21,indic:8,instal:22,intrins:15,intro:18,introduct:23,invers:15,kei:[0,1,2,3,4,5,15,16],kind:17,know:[0,1,2,3,4,16,18,22,24,25,26,27],launch:25,learn:[0,2,15,17,20],lemma:18,let:[13,18],likelihood:12,limit:19,line:25,link:17,load:[26,27],locat:26,log:[12,18,27],logger:27,mac:22,materi:[13,14],memori:15,meta:15,mission:23,mode:12,model:[0,1,2,3,4,5,15,17],motiv:15,mpi:[27,28],mujoco:22,multipl:25,multitask:15,need:25,observ:16,off:21,onc:25,openai:[],openmpi:22,optim:[1,4,16,18],option:[16,22],other:[0,1,2,3,4,5,15,18,20],our:23,output:26,paper:[0,1,2,3,4,5,15],part:[16,17,18],past:[13,18],path:15,perform:9,philosophi:23,plan:23,plot:24,plotter:29,polici:[0,1,4,5,12,14,15,16,18,21,26],ppo:12,prob:18,problem:[12,16],project:20,proof:[13,14,18],proxim:1,pseudocod:[0,1,2,3,4,5],python:22,quick:[0,1,2,3,4,5],quickstart:25,real:15,recap:18,refer:[0,1,2,3,4,5,20],region:4,regular:2,reinforc:[2,15],relev:[0,1,2,3,4,5],reproduc:15,request:12,research:[12,20],resourc:20,result:[24,25],review:15,reward:[16,18,19],right:20,rigor:20,run:[25,26,30],safeti:15,sampl:19,save:[0,1,2,3,4,5,25,26,27],scale:15,scratch:12,script:25,serv:23,set:[12,25],shortcut:25,should:[0,1,2,3,4,16,18,22,24,25,26,27],side:0,silent:12,simplest:18,soft:2,solut:[10,11],space:16,special:25,spin:[8,9,20,22],state:16,stochast:16,successfulli:26,suffix:25,support:23,swimmer:9,tabl:[0,1,2,3,4,5,8,9,12,15,16,17,18,20,21,22,23,25,26,27,28,30],task:19,taxonomi:17,td3:12,tensorflow:28,terminolog:16,theori:15,thi:[23,25],thought:20,tool:28,train:26,trajectori:16,transfer:15,trpo:12,trust:4,twin:3,ubuntu:22,unsupervis:15,util:[28,30],valu:[12,16,26],vanilla:5,walker:9,warn:[],welcom:8,what:[16,17,21,23],where:25,why:[0,1,4,5,21,23],work:11,world:15,write:12,you:[0,1,2,3,4,13,16,18,22,24,25,26,27],your:22}})
\ No newline at end of file
+Search.setIndex({docnames:["algorithms/ddpg","algorithms/ppo","algorithms/sac","algorithms/td3","algorithms/trpo","algorithms/vpg","etc/acknowledgements","etc/author","index","spinningup/bench","spinningup/exercise2_1_soln","spinningup/exercise2_2_soln","spinningup/exercises","spinningup/extra_pg_proof1","spinningup/extra_pg_proof2","spinningup/keypapers","spinningup/rl_intro","spinningup/rl_intro2","spinningup/rl_intro3","spinningup/rl_intro4","spinningup/spinningup","user/algorithms","user/installation","user/introduction","user/plotting","user/running","user/saving_and_loading","utils/logger","utils/mpi","utils/plotter","utils/run_utils"],envversion:51,filenames:["algorithms/ddpg.rst","algorithms/ppo.rst","algorithms/sac.rst","algorithms/td3.rst","algorithms/trpo.rst","algorithms/vpg.rst","etc/acknowledgements.rst","etc/author.rst","index.rst","spinningup/bench.rst","spinningup/exercise2_1_soln.rst","spinningup/exercise2_2_soln.rst","spinningup/exercises.rst","spinningup/extra_pg_proof1.rst","spinningup/extra_pg_proof2.rst","spinningup/keypapers.rst","spinningup/rl_intro.rst","spinningup/rl_intro2.rst","spinningup/rl_intro3.rst","spinningup/rl_intro4.rst","spinningup/spinningup.rst","user/algorithms.rst","user/installation.rst","user/introduction.rst","user/plotting.rst","user/running.rst","user/saving_and_loading.rst","utils/logger.rst","utils/mpi.rst","utils/plotter.rst","utils/run_utils.rst"],objects:{"":{"--ac_kwargs":[25,4,1,"cmdoption-act"],"--act":[25,4,1,"cmdoption-act"],"--count":[24,4,1,"cmdoption-count"],"--cpu":[25,4,1,"cmdoption-cpu"],"--data_dir":[25,4,1,"cmdoption-data-dir"],"--datestamp":[25,4,1,"cmdoption-datestamp"],"--deterministic":[26,4,1,"cmdoption-d"],"--env":[25,4,1,"cmdoption-env"],"--env_name":[25,4,1,"cmdoption-env"],"--episodes":[26,4,1,"cmdoption-n"],"--exclude":[24,4,1,"cmdoption-exclude"],"--exp_name":[25,4,1,"cmdoption-exp-name"],"--hid":[25,4,1,"cmdoption-hid"],"--itr":[26,4,1,"cmdoption-i"],"--legend":[24,4,1,"cmdoption-l"],"--len":[26,4,1,"cmdoption-l"],"--norender":[26,4,1,"cmdoption-nr"],"--num_cpu":[25,4,1,"cmdoption-cpu"],"--select":[24,4,1,"cmdoption-select"],"--smooth":[24,4,1,"cmdoption-s"],"--value":[24,4,1,"cmdoption-y"],"--xaxis":[24,4,1,"cmdoption-x"],"-d":[26,4,1,"cmdoption-d"],"-i":[26,4,1,"cmdoption-i"],"-l":[26,4,1,"cmdoption-l"],"-n":[26,4,1,"cmdoption-n"],"-nr":[26,4,1,"cmdoption-nr"],"-s":[24,4,1,"cmdoption-s"],"-x":[24,4,1,"cmdoption-x"],"-y":[24,4,1,"cmdoption-y"],"default":[26,4,1,"cmdoption-i"],logdir:[24,4,1,"cmdoption-arg-logdir"]},"spinup.utils":{mpi_tf:[28,3,0,"-"],mpi_tools:[28,3,0,"-"]},"spinup.utils.logx":{EpochLogger:[27,1,1,""],Logger:[27,1,1,""],restore_tf_graph:[27,0,1,""]},"spinup.utils.logx.EpochLogger":{get_stats:[27,2,1,""],log_tabular:[27,2,1,""],store:[27,2,1,""]},"spinup.utils.logx.Logger":{__init__:[27,2,1,""],dump_tabular:[27,2,1,""],log:[27,2,1,""],log_tabular:[27,2,1,""],save_config:[27,2,1,""],save_state:[27,2,1,""],setup_tf_saver:[27,2,1,""]},"spinup.utils.mpi_tf":{MpiAdamOptimizer:[28,1,1,""],sync_all_params:[28,0,1,""]},"spinup.utils.mpi_tf.MpiAdamOptimizer":{apply_gradients:[28,2,1,""],compute_gradients:[28,2,1,""]},"spinup.utils.mpi_tools":{mpi_avg:[28,0,1,""],mpi_fork:[28,0,1,""],mpi_statistics_scalar:[28,0,1,""],num_procs:[28,0,1,""],proc_id:[28,0,1,""]},"spinup.utils.run_utils":{ExperimentGrid:[30,1,1,""],call_experiment:[30,0,1,""],setup_logger_kwargs:[30,0,1,""]},"spinup.utils.run_utils.ExperimentGrid":{add:[30,2,1,""],print:[30,2,1,""],run:[30,2,1,""],variant_name:[30,2,1,""],variants:[30,2,1,""]},spinup:{ddpg:[0,0,1,""],ppo:[1,0,1,""],sac:[2,0,1,""],td3:[3,0,1,""],trpo:[4,0,1,""],vpg:[5,0,1,""]}},objnames:{"0":["py","function","Python function"],"1":["py","class","Python class"],"2":["py","method","Python method"],"3":["py","module","Python module"],"4":["std","cmdoption","program option"]},objtypes:{"0":"py:function","1":"py:class","2":"py:method","3":"py:module","4":"std:cmdoption"},terms:{"11th":[],"128_ac":25,"13th":20,"1_h32":25,"1_simple_pg":18,"2_rtg_pg":18,"32_seed10":25,"80s":21,"90s":21,"abstract":[15,20],"break":[0,3,12,13,14,18,20,23],"case":[1,9,12,13,14,16,17,18,20,22,25,26,27],"class":[8,15,21,28,30],"default":[9,12,22,24,25,26,27,30],"final":[2,4,16,20,21,22,27],"float":[0,1,2,3,4,5],"function":[0,1,2,3,4,5,9,10,11,13,15,17,18,20,21,25,27,28,30],"import":[11,15,16,17,18,20,22,23,25,26,27],"int":[0,1,2,3,4,5,18,24,25,26,27,28,30],"long":[1,11,18,20,21,22,23,25,26],"new":[1,4,17,18,20,22,23],"public":23,"return":[0,1,2,3,4,5,9,11,12,14,17,18,20,26,27,28,30],"rockt\u00e4schel":20,"short":[18,20,23,25],"super":[15,17,20],"throw":17,"true":[0,11,18,25,27,28,30],"try":[0,3,4,12,16,20,22,25,26],"var":[26,27],"while":[1,2,3,4,16,17,18,20,22,23,27],AIs:16,AWS:20,Adding:11,And:[0,18,20,22,23],Ape:15,Are:[12,15],But:[0,1,2,4,11,13,16,17,18,20,21,24,26,27],CTS:15,Doing:8,For:[0,2,3,16,17,18,20,23,25,26,27,28,30],Its:16,Not:[12,18],One:[0,3,13,15,16,17,18,23],RHS:2,That:[2,16,17,18,20,23],The:[1,2,3,4,5,8,9,10,12,13,14,15,17,18,23,24,25,26,27,28,30],Then:[0,13,18,20,22,27],There:[0,1,12,16,17,18,20,22,23,25,26,30],These:[8,12,16,20,25],Use:[12,17,18,20,26,27],Used:30,Uses:30,Using:[0,1,3,4,5,8,13,15],Will:27,With:[0,2,10,15,22,27],__init__:27,__main__:[25,27],__name__:[25,27],_h128:25,_h300_ac:25,a2c:[17,18,20],a3c:[15,17,20],a_ph:[0,1,2,3,4,5],a_t:18,abbeel:[6,7,15],abil:13,abl:[0,10,16,17,18,21],ablat:[20,30],about:[6,8,12,13,16,17,18,20,21,22,23,26,27,30],abov:4,absenc:[16,21,25],absent:2,absolut:[3,16],ac_kwarg:[0,1,2,3,4,5,25],academ:20,acc:27,acceler:[2,15,20],accept:[16,25,30],access:[0,1,2,3,4,5,16,17,20],accident:1,accord:[0,1,2,3,4,5,16,17,18,21],account:25,accru:18,accur:[4,17],accuraci:27,acer:15,achiam:[7,15,20],achiev:[3,16,20,23],acknowledg:8,acktr:[15,20],acquaint:16,across:[2,4,9,15,24,25,27,28],act:[2,14,16,17,18,25,26],act_dim:[0,1,2,3,4,5,11,16],act_limit:11,act_nois:[0,3],act_ph:18,action:[0,1,2,3,4,5,9,13,14,17,18,20,21,26,27],action_mask:18,action_spac:11,activ:[11,16,22,25,27,28],actor:[3,8,11,12,15,17,20,21,23,25],actor_crit:[0,1,2,3,4,5,21],actrelu:25,acttanh:25,actual:[2,4,11,16,18,20,24,30],adam:[5,20,28],adamoptim:[27,28],adapt:[0,15],add:[0,3,4,11,18,25,26,27,30],add_argu:25,added:[0,3,11,25,30],adding:3,addit:[0,16,28],addition:23,address:[3,15,23],adher:21,adjust:[1,4,16],advanc:[17,20],advantag:[1,4,5,12,15,17,18],advent:21,adversari:15,advic:20,advis:[7,18],after:[0,1,2,3,13,14,16,17,18,20,21,22,23,25,26,27,28],afterward:17,again:[1,13,18,26],against:[12,20],agent:[0,1,2,3,4,5,9,10,12,15,16,17,18,20,21,22,23,26,27],agi:23,agnost:15,ahead:17,aim:[16,18,20,24],alex:[6,20],algo1:24,algo2:24,algo:[4,12,25,26],algo_nam:25,algorithm:[0,1,2,3,4,5,8,9,16,18,20,22,23,24,25,27],align:[15,20],all:[0,1,2,3,9,12,15,16,17,18,20,21,23,24,25,26,27,28,30],allow:[0,1,2,4,13,16,17,18,22,25,30],almost:[2,3,4,11,16,17,18,20,23,28],alon:17,along:[3,12,20,22],alongsid:26,alpha:2,alphazero:[15,17],alreadi:[1,4,5,20,23],also:[0,1,2,4,6,11,16,17,18,20,21,22,23,25,26,27,28,30],altern:[2,15,16,20],although:[1,16,18,21,25],altogeth:21,alwai:[0,1,2,3,4,5,16,17,18,20,21],amanda:6,ambigu:23,amen:25,amend:30,amodei:15,among:21,amount:[1,4,5,20],anaconda:22,analag:14,analys:20,analysi:[1,8,20],analyt:[4,18],andrej:20,andrychowicz:[15,17],anecdot:12,angl:[16,18,20],ani:[0,1,2,3,4,5,12,16,17,18,20,21,22,23,24,25,26,27],announc:23,anonym:15,anoth:[2,3,11,16,17,20,22,26],answer:[4,23],ant:25,anthoni:15,anticip:23,anymor:1,anyon:22,anyth:[12,16,26,27],anywher:[18,27],api:[0,1,2,3,4,5],appeal:[16,18],appear:[12,20,22,25],append:18,appendix:2,appli:[13,15,16],applic:12,apply_gradi:28,approach:[0,2,15,17,18,20,23],approch:20,appropri:[0,1,2,3,4,5,16],approxim:[0,1,2,3,4,5,15,17,18,20,21,23],april:23,apt:22,arang:11,arbitrari:[2,15,16,27],architectur:[9,15,20],area:21,aren:[17,20,23],arg:[0,1,2,3,4,5,24,25,26,27,30],argmax:27,argpars:25,argument:[0,2,3,16,21,22,24,25,27,28,30],argumentpars:25,aris:[3,23],around:[4,22],arrai:[11,18,28],arriv:5,art:20,articl:[18,20],artifact:4,as_list:11,ascent:[0,5,17,18],ask:[23,27],askel:6,aspect:20,aspir:20,assertionerror:26,assum:[0,2,16,18,20,30],assumpt:11,asynchron:[15,17,20],atari:[15,16,20],atom:17,attent:[15,20],attribut:20,augment:[15,17],author:[0,2,8,18],auto:[20,25,30],autocomplet:24,automat:[1,12,24,25,30],autonom:15,auxiliari:15,avail:[12,16,17,20,22,25,30],averag:[0,2,3,4,9,10,11,12,16,18,24,27,28],average_onli:27,averageepret:[24,26],averagetest:27,averagetestepret:24,avert:3,avoid:[4,16,17,20,30],awai:[1,20],awar:[18,20,23],axi:[11,18,24,27],back:[4,13,21],background:[8,12,15,17,18,22,23],backtrack:4,backtrack_coeff:4,backtrack_it:4,backup:[2,11,12,16],bad:[2,4,11,16,17,20],bahdanau:20,ballpark:[12,20],bandwidth:23,bar:12,barri:[],barrier:23,barto:17,base:[0,2,3,5,8,18,20,21,23,25,27,28,30],baselin:[0,1,3,4,8,20,23,28],basi:[18,20],basic:[1,8,14,16,17,20,21,23,25,27],batch:[0,1,2,3,4,5,9,11,12,18,20],batch_act:18,batch_len:18,batch_loss:18,batch_ob:18,batch_ret:18,batch_siz:[0,2,3,18],batch_weight:18,bay:20,bear:18,beat:20,becaus:[0,1,3,4,5,11,12,13,14,16,17,18,20,21,22,23,25,26,30],becom:[1,4,5,11,17,20,23,25],beefi:20,been:[0,2,6,16,17,20,22,23,25,27],befor:[0,2,3,13,15,18,20,25,27],begin:[0,2,3,12,18,23],behav:[17,27],behavior:[0,1,3,11,15,16,20,25,30],behaviour:[1,15],behind:[0,16],being:[0,13,16,17,18,20,23,25],believ:23,bellemar:[15,17],bellman:[0,2,3,11,12,17,21],belong:18,below:[17,22,24,26],ben:6,bench:25,bench_algo1:24,bench_algo2:24,bench_algo:24,bench_ppo_cartpol:25,benchmark:[5,8,15,17,20,22,23],benefit:[1,2,20,21,23],berkelei:[6,7,20],best:[20,23,25],beta:6,better:[0,2,3,16,17,18,20],between:[0,1,2,3,4,5,11,12,13,15,16,17,20,23,24,30],bewar:20,beyond:[1,17,20,23],bia:17,bias:16,big:20,biggest:[1,17,20],bind:28,bind_to_cor:28,bit:[1,2,16,17,20,28],block:18,blog:[20,23],blundel:15,blur:16,bolt:20,bonu:[2,8,12],bonus:2,bool:[25,27,28,30],bootcamp:20,bootstrap:17,both:[1,2,3,4,5,9,16,18,20,21,23,24,27],bottleneck:15,bound:2,boundari:20,box2d:[22,26],boyd:4,bracket:25,brain:[16,20],branch:17,breakthrough:[16,20],brew:22,bridg:[2,15],brittl:[3,20,21],broad:[20,23],broadcast:11,broken:[13,20],brought:15,brundag:6,buckman:15,buffer:[0,2,3,21,26],bug:[20,22,23],bugged_mlp_actor_crit:11,build:[11,16,18,20,21,23,25],built:[8,16,25],bunch:1,bundl:20,burda:15,c51:[15,17],cabi:15,calcul:[0,3,4,13,18,20,21,27],calculu:[18,20],call:[0,8,16,17,18,20,22,25,26,27,28],call_experi:[25,30],callabl:[25,30],can:[0,1,2,3,4,5,8,11,12,13,14,17,18,20,21,22,23,24,25,26,27,30],cancel:11,candid:[17,24],cannot:18,care:[0,2,18,20,21,23,26],carefulli:17,carlo:17,cartpol:[20,25],cash:16,cast:27,castelao:6,categor:[17,18],catherin:20,caus:[1,4,5,17],causal:15,caution:1,ceil:1,center:[6,20],central:[2,16,23],certain:20,cg_iter:4,chain:[13,18,20,23],challeng:[0,8,17,20,23],chanc:[2,13,20],chang:[1,2,3,16,17,18,20,23,25,26,30],channel:15,chapter:[4,5,17,18],charact:[15,16,25],characterist:20,chart:20,check:[1,8,11,12,20,25,26],chess:15,choic:[0,2,12,13,16,17,18,20,23,25,27],choos:[12,16,17,20,30],chose:21,chri:20,christi:6,christiano:15,chung:20,cinch:26,circumst:[17,20],citat:[17,20],claim:[13,14,15,18,20],clark:6,classic:[5,8,17,20,22],classifi:16,clavera:15,clean:[22,27],clear:[5,16,20,23,27],clearli:[13,23],cli:25,click:25,clip:[1,2,3,15,20],clip_ratio:[1,25],clock:12,clone:22,close:[0,1,2,3,4,5,8,21,23,30],closer:23,cloud:20,cluster:17,cmd:25,code:[0,1,4,6,8,15,16,18,20,25],coeffici:[1,2,4],cohort:23,coin:2,coincid:4,colin:6,collaps:[1,4],collect:[0,9,17,18,20],colon:30,color:27,column:[20,24],com:22,combin:[18,23,25],come:[0,2,11,13,16,18,20,22,23,26],comfort:[1,4,12,20],command:[4,8,12,21,22],commit:23,common:[3,16,18,20,23,25],commun:[22,23],compar:[0,5,12,20,24],comparison:[20,23],compat:[6,11,21,23],competit:15,complet:[11,13,16,17,20,21,22,23,25,26],complex:[1,2,20],complic:30,composit:[0,2,3],comprehens:15,compu:[],comput:[0,1,2,3,4,5,11,13,16,17,18,20,21,23,27],compute_gradi:28,concat:11,concept:[8,17,20],conceptu:[18,20],conclud:13,conclus:20,concret:15,concurr:[0,2,3,17,18,20,21],conda:22,condit:[0,1,2,3,4,5,13,14,16,18,27],confer:20,confid:20,config:[24,26,27,30],configur:[25,26,27,30],confound:20,confus:[16,22],conjug:4,connect:[0,2,4,16,17,18,21,23,25],consequ:[16,18,23],conserv:17,consid:[11,12,13,17,18,20,25,26,28],consider:1,consist:[16,17,20,23],constant:[0,11,13,14],constrain:[1,15],constraint:[1,4,16],construct:[18,25,30],contain:[0,2,4,5,11,15,20,21,23,24,26,27,28,30],contest:20,context:[16,17,20],continu:[0,1,2,3,4,5,15,16,18,20,22],contrast:17,contribut:[1,6,15,20,23],control:[0,1,2,4,5,15,16,17,20,22,25],conv:20,conveni:[16,25,30],convent:[0,16,23],converg:[2,15,16],convert:[16,25],convex:4,convolut:[16,20],copi:[0,1,2,3,4,5,18,25,26,27],core:[8,12,16,18,20],correct:[2,4,11,12,20,24,25,26],correctli:[12,18,20,22],correl:0,correspond:[0,1,2,16,17,22],cost:[4,15],could:[2,12,16,18,20],count:[15,20,24,28],counter:15,coupl:[17,20],cours:[0,1,2,3,4,5,9,20],covari:[16,20],cover:[17,18,26],cpo:15,cpu:[20,25,30],crash:26,crater:18,creat:[0,1,2,3,4,5,12,16,17,18,20,22],credit:20,crisp:15,criteria:12,critic:[1,3,4,5,8,11,12,15,16,17,20,21,23,25],critiqu:8,crucial:16,cs231n:20,csaba:20,culmin:21,cumul:[16,20],curat:23,curios:15,current:[0,1,2,3,4,5,16,17,18,20,22,23,25,26,27,28],curriculum:[20,23],curv:[10,11,20,24],custom:25,d_kl:4,d_ph:[11,12],dabnei:[15,17],dai:[13,20,22],dalal:15,damp:[3,4],damping_coeff:4,danger:4,daniel:6,data:[0,1,3,4,15,17,18,20,21,22,24,25,26,27],data_dir:[25,30],dataset:[18,27],date:[25,30],datestamp:[25,30],dd_:25,dd_exp_nam:30,dd_hh:[25,30],dd_ppo_ant_cli0:25,ddpg:[2,8,9,11,15,17,20,21,23,25],deadli:17,deal:[0,4,12,13,16],debug:20,decai:20,decid:[16,17,22,30],decis:[13,14,16,20,23],decompos:13,decreas:1,deep:[2,5,7,12,16,17,18,21,22,23,25],deepli:4,deepmim:15,def:[11,18,27],default_data_dir:[25,26,30],default_shorthand:30,deficit:21,defin:[2,14,16,17,18,21,27],definit:[2,14,16,20,21,22],degrad:[12,20],delai:[0,8,17,21],delta:4,dennison:6,denot:[0,4,5,11,16],dens:[16,27],densiti:[2,15],deped:15,depend:[0,1,2,4,5,6,11,12,14,16,18,20,21,23],depth:18,deriv:[1,8,13],descend:21,descent:[0,1,4,5,9,15,18,20,23],describ:[0,1,4,16,17,18,22,25,27],descript:[0,1,2,3,4,5,15,16,17,18,22,26],deserv:20,design:[3,8,17,20,21,27],desir:[13,17,18,20,27],despit:[5,18,20],detail:[0,1,2,8,11,18,20,23,26,27],deter:20,determin:23,determinist:[2,3,8,9,17,21,26],dev:[2,9,22],develop:[3,7,8,17,23],deviat:[2,16,20,27],devin:6,dexter:15,diagnos:20,diagnost:[21,27],diagon:12,diayn:15,dict:[0,1,2,3,4,5,25,26,27,30],dictionari:27,did:[2,15,22],didn:13,differ:[1,2,3,4,10,11,12,15,16,18,20,23,24,25,26],differenti:[0,2],dig:[16,23,26],digest:[17,23],dimens:[3,11],dimension:[1,4,5,15,16],direct:2,directli:[0,4,16,17,20,21,25,30],directori:[8,24,25,27,30],disabl:30,discard:17,disclaim:17,discount:[0,1,2,3,4,5,14,16,18],discourag:20,discov:20,discoveri:15,discret:[0,1,2,4,5,16],discrimin:15,discuss:[16,18,23,25],disguis:20,dishearten:20,distanc:4,distil:[15,17],distinct:[13,16],distract:8,distribut:[0,2,3,4,12,13,16,17,18,20,23,26],dive:[4,20],diverg:[1,4],divers:15,doc:[8,23,25,26,28],docstr:25,document:[8,15,16,22,23,25,26,27,28,29],doe:[0,1,2,3,4,13,16,18,20,26],doesn:[0,1,4,11,13,18,20,26],doing:[1,4,20,21],domain:20,don:[4,8,11,12,16,17,20,21,22,23,26],done:[2,11,18,20,23],donoghu:15,dopamin:15,dota:16,doubl:[2,3,15,26],down:[0,4,5,13,18,26],download:22,downsid:17,dpg:[0,15],dqn:[0,15,17,20],dramat:[1,3],drastic:12,draw:17,drawn:2,dream:17,drive:20,driven:15,drop:[13,18],dropout:20,dry:22,dtype:[16,18],dualiti:4,duan:[5,15,20],due:[2,4],duel:15,dump_tabular:27,dure:[4,17,18],dylan:6,dynam:15,each:[0,1,2,3,4,5,8,11,12,13,16,17,18,20,21,23,24,25,26,27,28,30],earli:[1,10,13,15,18,20,21],earlier:[0,5,15],eas:18,easi:[17,20,23,25,26,27,28],easier:[1,16,17,20,23,30],easiest:4,easili:[24,25,26,27],educ:23,effect:[12,17,18,20],effici:[0,4,15,17,20,21],effort:[12,17,20],eglp:[13,18],either:[0,1,2,3,4,5,16,17,20,23,24,25,30],element:[20,21,27],elementwis:16,els:[11,18,20,27],elu:25,emb:17,embed:[15,17,27],embrac:20,emerg:[1,15],emphasi:20,empir:[1,15,18,20,21],emploi:0,empow:23,empti:[18,26],enabl:25,encod:[18,20],encompass:17,encourag:[1,4,5,23],end:[1,18,20,23,26,27],endless:23,endow:17,enforc:[16,20],engag:20,engin:[20,22,23],enjoi:[16,18,20],enough:[0,3,11,18,20,21,22,23,25],ensembl:15,ensur:[1,2,23,25],enter:20,entir:20,entireti:[12,14,18],entri:[4,11,16,21,23],entropi:[15,17,20],env:[12,18,22,25,26],env_fn:[0,1,2,3,4,5,25,30],env_nam:[25,30],environ:[0,1,2,3,4,5,8,11,12,15,16,17,18,20,21,22,27,30],ep_len:18,ep_ret:18,ep_rew:18,episod:[0,1,2,3,4,5,12,15,16,18,20,23,26],eplen:26,epoch:[0,1,2,3,4,5,9,12,18,25,26,27],epoch_logg:27,epochlogg:[0,1,2,3,4,5,27],epret:26,epsilon:20,equal:[4,14,18,20,27],equat:[17,18,20,21],equival:[1,2,4,5,15,17,18,25],ermon:15,error:[0,2,3,4,11,15,16,18,23,25],escap:25,especi:[4,17,20,22],espeholt:15,essai:[12,17,23],essenti:[3,16,25,27],establish:[0,15,20],estim:[0,1,2,3,4,5,12,15,16,17,18,20],etc:[8,17,26],eval:25,evalu:[0,2,4,12,18,20,26],even:[0,4,17,18,20,25],evenli:2,ever:[0,16,18,26],everi:[0,2,3,9,13,16,20,23,25],everyon:23,everyth:[0,14,20,22,26,27],everywher:11,evolut:15,evolv:20,ex2:[12,15],exact:23,exactli:[3,4,18,20],examin:18,exampl:[11,13,15,16,17,18,20,22,23,25,30],exc:24,except:[2,12,20,22,23,25,28],excerpt:[11,12],excit:[20,23],exclud:24,exclus:24,execut:[17,22,25,30],exemplar:15,exercis:[4,8,23],exercise1_1:12,exercise1_1_soln:12,exercise1_2:12,exercise1_2_soln:12,exercise1_3:12,exercise2_2:12,exhaust:[0,6,17],exist:[17,20],exit:[15,17],exp_nam:[12,22,24,25,27,30],exp_name_:30,expand:[2,4,13],expans:[4,15,17,20],expect:[2,5,8,11,13,14,16,17,20,22,26,27],expens:[0,4],experi:[0,2,8,12,15,17,18,20,21,23,24,27],experienc:0,experiment:25,experiment_nam:25,experimentgrid:8,expert:[17,23],expertis:23,explain:[0,2,12,18,20],explan:[16,22],explicit:17,explicitli:[2,17],exploit:[17,21],explor:[7,8,17,20],exponenti:16,expos:[17,20,21,23],express:[1,4,13,14,16,18],extens:[17,18,22],extent:12,extrem:[17,18,22],eye:20,eysenbach:15,face:20,facebook:15,facilit:[0,3,15,25],fact:[16,18,25],facto:22,factor:[0,1,2,3,4,5,12,15,16],fail:[11,17,20,26],failur:[3,8,17,20],fair:[0,16,20],fairli:[0,20],fals:[0,18,27,28,30],famili:[1,15,16,17,21],familiar:20,famou:17,famous:16,fantast:23,far:[1,4,15,16,17,18,20],farquhar:20,farther:[1,13],fast:[15,20,26],faster:18,favor:21,featur:[2,21,26],feed_dict:[18,27],feedback:[6,23],feedforward:18,feel:[1,4,12,18,20,25],feinberg:[15,17],fellow:23,fend:3,fernando:15,fertil:20,feudal:15,few:[1,12,13,16,17,18,20,23,25,26,28,30],fewer:1,fictit:17,fictiti:17,fidel:15,field:[15,17,20,21,23],figur:[1,12,17,20,22,24],file:[12,24,25,26,27,30],filepath:27,find:[0,3,12,15,18,20,24,25,26],fine:[15,17,20],finish:[12,20,22],finished_rendering_this_epoch:18,finit:[0,2,5,14,16,18],finn:15,first:[0,1,2,3,14,16,17,18,20,22,23,27,30],fit:[0,17],five:9,fix:[0,2,3,14,16,17,18,23,24],flag:[12,20,24,26],flatten:[1,4,5],float32:[16,18,27],flood:26,flow:15,focu:[1,20],focus:20,foerster:20,folder:[25,26,30],follow:[0,2,12,14,15,16,17,18,20,21,22,23,26,27],follw:22,forc:[25,30],force_datestamp:[25,30],forego:[16,17],forev:16,forget:26,form:[1,2,3,8,13,14,20,27,30],formal:8,format:[8,26],formul:[15,16,18],formula:[5,12,18],forward:20,found:[0,1,4,5,18,22,23],foundat:[4,15,17,18,21],four:16,fpath:27,frame:20,framework:[15,16,17,23],free:[8,12,16,18,22],freecodecamp:22,frequenc:27,frequent:[3,12,16,20],fresh:2,frill:20,from:[0,1,2,3,4,5,8,9,11,13,14,15,16,17,18,20,21,22,23,24,26,27,28,30],frozenlak:20,frustrat:20,fujimoto:[3,15,17],full:[20,22,25,27,30],fulli:[16,21],fundament:[17,23],furthermor:4,futur:[2,13,14,15,16,17,23],gae:[1,4,5,15,18],gail:15,gain:[17,20,21],game:[0,16],gamma:[0,1,2,3,4,5,11,12,22],gan:15,gap:[0,1,2,3,4,5,15,23],garfinkel:6,gate:20,gauci:15,gaussian:[0,2,3],gave:6,gce:20,gcl:15,gener:[0,1,2,3,4,5,12,15,17,18,20,23,25,27,28,30],get:[0,1,2,3,4,5,6,10,11,12,16,17,18,20,21,22,23,24,25,26,27,28],get_act:26,get_stat:27,git:22,github:[18,22],give:[0,1,2,3,4,5,6,13,16,17,18,20,23,25,27,30],given:[0,1,2,3,4,5,6,11,12,16,17,18,24,25,27,30],glanc:1,global_step:28,global_variables_initi:27,goal:[12,15,16,17,20,25],goe:[1,18],going:[1,12,13,20,22],gone:17,good:[0,12,16,18,20,21,22,23],googl:20,govern:16,gpu:20,grace:27,gracefulli:26,grad:[8,14,28],gradient:[1,4,8,9,12,13,16,17,20,21,23,27,28],grads_and_var:28,gradual:20,grand:20,graph:[0,1,2,3,4,5,8,9,11,21,24],grappl:1,grate:6,gratefulli:6,great:[3,20,21],greedi:20,green:27,greff:20,greg:[6,20],gregor:15,grid:[25,30],ground:[0,6,17],group:[6,20,25,27],grow:[1,20,23],gru:20,grusli:15,guarante:[18,21],guid:[15,16,20],gym:[0,1,2,3,4,5,9,12,20,21,22,25,26,30],haarnoja:[2,15,17],habit:20,hack:26,hackathon:20,hacki:28,had:18,hadfield:6,half:2,halfcheetah:[11,12,20],halfwai:20,hall:6,hammer:20,hand:[15,16,26],handi:24,handicap:20,handl:[1,2,25,26,27,30],happen:[0,3,13,16,17,20,21,23],happi:4,hard:[1,4,16,17,20,23],harder:[3,20],hardest:[12,20],hardwar:20,has:[0,1,2,3,4,5,7,11,16,17,18,20,22,23,25,26,27],hash:15,hasn:[20,22],hasselt:15,hausknecht:15,hausman:15,have:[0,1,2,3,4,6,11,12,13,16,17,18,20,21,22,23,24,25,26,27,30],haven:20,head:2,hear:23,heard:20,heess:[1,15],help:[0,1,2,3,6,14,20,22,23,25,26,30],henc:[3,17],henderson:[15,20],her:[15,17],here:[0,1,2,4,6,12,13,16,17,18,20,22,23,25,26,27,30],hertz:20,hesit:6,hessel:15,hessian:4,hex:17,hid:[22,25],hidden:[12,16,25],hidden_dim:16,hidden_s:[11,18,25,27],hide:23,hierarch:15,hierarchi:[8,20],high:[1,2,4,5,11,15,16,20,21,23,25],higher:[0,2,3,5,16,17],highest:[2,16],highli:[0,18,23],highlight:[0,16,17,18,27],hindsight:[15,17],hint:[12,16],hirl:15,hiro:15,histori:[16,21],hit:[1,22],hobbyist:23,hold:[14,20],homebrew:22,hood:[20,25],hopefulli:20,hopper:[10,12,25],horgan:15,horizon:[2,5,14,15,16,17,18,20],horovod:28,hour:20,houthooft:15,how:[0,1,2,3,4,5,8,12,15,16,17,18,20,22,24,26,27],howev:[3,11,16,23,25,30],http:22,huge:20,human:[6,15,16],humanoid:20,hundr:20,hurdl:20,hurt:4,hyperparamet:[0,1,3,4,9,12,20,26,27,30],hypothes:20,i2a:[15,17],icm:15,idea:[5,16,20,21,22],ideal:20,ident:[15,18,24],idx:27,ignor:17,illustr:[2,25],ilya:15,imag:[20,21],imagin:[15,17],imit:8,immedi:[0,14,16,17,18,23],impact:[12,20],impala:15,implement:[8,11,15,16,17,20,21,23,25,26,27],impli:18,implicit:15,importantli:[2,23],imposs:26,impress:20,improv:[0,1,2,3,4,6,15,17,18,20,21],in_nam:[25,30],inaccur:18,inadvert:23,inadvertantli:[],incent:1,includ:[0,1,2,3,4,5,6,8,15,16,17,18,20,22,23,25,27,30],inclus:[16,30],incompat:[17,25],incorpor:2,incorrect:3,increas:[1,2,4,20,27],incred:20,increment:20,incrementalist:20,independ:[2,18],index:[8,16],indic:[0,16,30],indirectli:17,individu:25,infinit:[2,5,14,16,18],influenc:20,info:[4,18,27],info_ph:4,inform:[15,16,17,20,21,22,23,26,27],inher:2,initi:[1,4,5,20,21,27],inner:[13,14],inner_prefix:25,innermost:[3,13],input:[0,1,2,3,4,5,12,16,27],insid:13,insight:[20,21],inspir:20,instabl:17,instal:8,installtest:22,installtest_s0:22,instanc:[16,18,20,26],instanti:21,instead:[0,1,2,3,4,12,16,17,20,23,24,25,30],instruct:[12,16,22,23],instrument:20,int32:[18,27],integ:4,integr:20,intend:[20,23],intens:17,intent:15,interact:[0,1,2,3,4,5,9,16,17,18,21],interactivesess:11,interest:[18,20,23],interestingli:15,interfac:26,interleav:[0,20],intermedi:[4,18],intern:[20,24,27],interpol:[0,2,3,15,17],interpret:[18,24,25,26],intervent:15,intro:[8,20],introduc:[2,3,4,15,16,20],introduct:[0,5,8,12,16,17,20],intuit:[1,16,18,20],invent:20,invers:[2,4,8],invertedpendulum:[12,20],investig:20,involv:[6,11,17,18,23],ioff:20,ipg:15,ipython:26,iqn:15,irl:15,irpan:20,islam:[15,20],isn:[2,4,27,30],issu:[1,3,4,11,16,20,21,22,23],iter:[4,14,17,20,26,27],itr:[26,27],its:[0,1,2,3,4,5,16,17,18,22,27],jack:6,jaderberg:15,jakob:20,job:21,jog:13,john:20,joint:[13,16],jointli:20,jona:6,josh:[7,20],joshua:20,json:[24,26,27],jump:[15,23],just:[0,3,4,14,16,18,20,22,25,26,27,30],justif:15,kakad:[4,15],kalashnikov:15,karpathi:20,keep:[0,1,3,4,20,26,27],kei:[8,17,18,20,21,23,25,26,27,30],kera:27,keyword:[0,1,2,3,4,5,25,27],khan:6,kick:1,kind:[1,3,4,8,16,18,20,23],kingma:20,know:[11,20,23],knowledg:[20,23],known:[12,26],krogh:20,kroneck:15,kurutach:15,kwarg:[0,1,2,3,4,5,25,26,27,28,30],l62:18,l86:18,l99:18,lag:0,lagrangian:4,lai:18,laid:23,lam:[1,4,5],lambda:[1,4,5,25],land:16,landscap:17,langford:[4,15],languag:16,larg:[0,1,4,15],largest:4,larissa:6,last:[12,13,14,16,18,26],lastli:[3,4],late:21,later:[0,2,13,15,16,18,20,26],latest:[1,4,5,26],latter:0,launch:[8,12,17,20,23,26,28,30],law:[14,16],layer:[15,16,20,25,27],lead:[2,3,4,5,15,16,18,20,21],leak:30,leakag:27,leandro:6,learn:[1,3,4,5,7,8,10,11,12,16,18,21,22,23,25,26],learner:[15,20],least:[1,20,22],leav:[11,14,15,22,27],lectur:23,led:[5,21],left:[9,13,30],legend:24,lemma:[8,13],len:[18,26,27],length:[0,1,2,3,4,5,12,18,20,26],less:[1,3,4,5,17,20],lesson:20,let:[0,1,2,4,5,8,11,14,16,20,22,25,26,27],letter:30,level:[16,20,21,23],lfp:15,liang:15,libopenmpi:22,librari:[15,20,22,25],licens:[22,26],lie:3,life:18,like:[0,1,2,4,12,13,16,17,18,20,21,22,23,24,25,26,27],likelihood:16,lilian:[6,20],lillicrap:[0,15,17,20],limit:[1,3,4,25],line:[4,8,9,11,16,18,20,21,23,26,27],linear:16,link:[8,15,28],linux:22,list:[6,11,15,17,18,20,23,25,30],liter:18,literatur:[16,18,20,23],littl:[2,12,16,17],liu:15,live:[16,17,23],load:[0,1,2,3,4,5,8,23],load_data:27,load_polici:26,local:[1,2,4,5,17,20,25,27],locat:[8,25],lock:23,locomot:[1,15],log:[1,2,4,5,8,14,16,21,23,24,26],log_prob:18,log_softmax:18,log_tabular:27,logdir:24,logger:[0,1,2,3,4,5,8,21,26,30],logger_kwarg:[0,1,2,3,4,5,25,27,30],logic:[13,21],logit:[16,18,27],logp:[1,4,5],logp_pi:[1,2,4,5],logprob:18,logx:27,longer:20,look:[1,12,15,20,22,24,26,27,28],loop:[16,17,18,21,27],lose:[16,20],loss:[0,1,2,3,11,12,18,20,21,27,28],lost:27,lot:[12,17,20],low:[2,18],lower:[2,5],lstm:20,lucid:5,lunarland:[22,25],machin:[12,20,23,25,30],macro:15,maddi:6,made:[13,21],mahmood:15,mai:[0,1,2,3,4,5,6,12,16,17,20,22,24,30],main:[0,1,2,3,4,5,16,17,18,20,21,27],maintain:27,mainten:30,make:[0,1,2,3,4,5,10,12,13,16,17,18,20,21,22,23,25,26,27,28,30],maml:15,manag:[21,22],mani:[6,12,15,16,17,18,20,22,23,24,25,26,27,28,30],mania:15,manipul:15,manner:26,map:[15,16,26,27],margin:13,markov:[14,16],mask:[11,18],mass:2,master:15,masteri:20,match:[23,24,27],materi:[6,17,18,20,23],math:[0,1,4,16,18,20,23],mathemat:[11,13,16,18,20,21],matl:15,matrix:[4,11,16],matter:[4,15,16,20,23],matthew:20,matthia:6,max:[0,1,12,20,27,28],max_ep_len:[0,1,2,3,4,5],maxim:[0,1,2,3,15,16,17,18],maximum:[0,1,2,3,4,5,12,15,26],maxtest:27,mayb:30,mbmf:[15,17],mbve:17,mdp:[15,16],mean:[0,1,2,3,4,5,9,12,16,17,18,20,26,27,28],meaning:20,meant:[16,26],measur:[2,4,9,16,18,20,24],mechan:20,memori:[8,20],menel:6,mention:0,merg:15,merit:20,merlin:15,mess:11,messag:27,meta:[8,17,20],metamim:15,method:[1,3,4,5,12,15,16,17,18,20,21,25,27,28],methodolog:15,metric:[9,18,21,26,27],mfec:15,middl:23,might:[0,11,12,16,18,20,22],mile:6,million:4,min:[1,20,27,28],mind:20,minibatch:[0,1,2,3,9],minim:[0,2,3,18,23,27],minimum:2,mintest:27,minut:[12,20,22],mirag:15,mishra:15,miss:23,mission:8,mitig:21,mlp:[11,12,16,18,20,21,27],mlp_actor_crit:[0,1,2,3,4,5,11],mnih:[15,17,20],mnist:27,mode:[3,8,17,20],model:[8,20,21,26,27],model_info:26,modern:[5,17],modif:[4,18,23,28],modifi:26,modul:[8,15,23,26],modular:[17,20],modularrl:[1,4],momentum:20,monoton:[4,15,20],mont:17,month:[20,23],more:[0,1,2,4,12,16,17,18,20,23,24,25,26,28],morn:13,most:[0,14,16,17,18,20,21,23,25,26,27],mostli:16,motiv:[0,1,4,20],motor:15,move:[16,20,23],mpc:17,mpi:[1,4,5,8,23,25,30],mpi_avg:28,mpi_fork:28,mpi_statistics_scalar:28,mpi_tf:28,mpi_tool:28,mpiadamoptim:28,mpo:15,msbe:[0,2],msg:27,much:[0,1,3,16,17,18,20,21],mujoco:[8,9,12,21],mujocotest:22,mujuco:[],multi:[20,23],multinomi:[16,18],multipl:[1,13,16,24,26,27,30],multipli:11,multitask:8,multivari:[16,20],must:[0,1,2,3,4,5,16,18,25,30],mutual:15,mve:15,n_act:18,nachum:15,nagabandi:[15,17],name:[6,16,18,20,24,25,26,27,28,30],nameofquant:27,narrowli:20,natur:[4,15,16,20,23],nec:15,necessari:20,necessarili:23,need:[0,2,4,13,15,16,18,20,21,22,23,24,26,27],neg:[1,18],nest:30,net:[15,16,20],network:[0,2,3,4,5,9,11,12,15,16,17,18,20,25],neural:[0,2,4,5,12,15,16,17,18,20,25],neutral:18,never:[12,17,20],next:[0,3,16,20,21,23,27],nice:[4,25,26],nois:[0,2,3,9,16,18],noise_clip:3,non:[0,11,12,17,21,23],none:[11,16,18,25,26,27,28,30],nonetheless:23,nonneg:[4,16],nonzero:18,norend:26,norm:20,normal:[0,2,3,4,16,18,20,23,28],notabl:16,notat:[0,14,16,17],note:[0,1,2,16,24,25,26,27,30],noth:[18,25],notion:16,nov:23,novemb:22,novic:20,now:[2,13,16,17,18,20],npg:4,num_cpu:[25,30],num_proc:28,num_run:25,num_sampl:18,number:[0,1,2,3,4,5,6,16,17,18,20,23,24,25,26,27,28,30],numer:[4,18,27],numpi:[11,27],nut:20,nutshel:16,obei:16,object:[1,4,16,17,18,20,21,23,25],obs:[16,18],obs_dim:[16,18],obs_ph:18,obscur:23,observ:[0,12,15,17,18,21,23,27],obtain:[0,2,13,16,17,18],obviou:23,octob:20,odd:[13,20],odyssei:20,off:[0,1,2,3,6,9,15,16,17,20,23,24],often:[0,1,2,3,4,5,16,18,20,25],olah:20,old:[0,1,4,18,21,26],older:26,olsson:20,omit:[16,17,23,30],onc:[0,1,3,4,9,20,22,27],one:[2,3,12,16,18,20,21,22,23,24,25,26,30],one_hot:[18,27],ones:[17,20,25],onli:[0,1,2,3,4,11,12,13,14,16,17,18,20,22,23,24,25,26,27],open:[15,20],openai:[0,1,2,3,4,5,7,12,15,20,22,23,25],openmpi:8,oper:4,oppos:20,ops:21,opt:15,optim:[0,2,5,8,15,17,20,21,28],optima:[1,4,5],optimum:[2,15],option:[8,13,15,17,18,20,24,26,27,28,30],order:[0,1,4,6,12,16,20,21,22,23],organ:[20,22,23,26],orient:20,origin:[0,1,2,4,13,20,23,28],orthogon:17,ostrovski:15,osx:22,other:[8,9,12,16,17,21,23,26,27],otherwis:[5,6,20,22,23,24,25,27],ouput:11,our:[0,1,2,3,4,5,8,12,13,16,17,18,26],out:[1,2,3,6,11,12,13,14,17,18,20,21,22,23,24,25,26,27],outcom:[2,18],outdat:0,outer:[11,13],outer_prefix:25,outlin:20,output:[0,1,2,3,4,5,8,12,16,17,18,24,25,27],output_activ:[11,27],output_dir:[25,27,30],output_directori:[24,26],output_fnam:27,output_typ:27,outsid:13,outward:3,over:[0,1,2,3,4,5,9,10,11,12,13,14,16,17,18,20,24,25,27,28],overal:20,overestim:3,overfit:[0,18,20],overview:20,overwhelm:22,overwrit:[26,27],own:[16,20],packag:[21,22,23,25,26],page:[2,8,9,12,17,18,22,23,25,26,27,28,29],pai:17,pain:[2,15,23,26],painfulli:[0,4],painless:23,pair:[0,1,2,3,4,5,16,18,30],paper:[8,17,18,20,23],paradigm:23,parallel:[0,1,2,3,4,5,15,20,25,27],param:[25,28,30],param_nam:25,paramet:[0,1,2,3,4,5,16,17,18,20,21,24,25,27,28,30],parameter:[2,16,18],parisotto:15,pars:23,parse_arg:25,parser:25,part:[0,3,5,8,11,12,20],partial:[15,16,20,26],particular:[3,6,16,18,20,21,25],particularli:[1,17,26],partli:11,pass:[21,25,26,27,30],past:[8,14,20],path:[12,20,24,25,26,30],pathak:15,pathnet:15,patient:22,pattern:23,pcl:15,peak:3,peek:20,penal:1,penalti:[1,15],peng:15,peopl:[6,20,23],per:[0,1,4,5,9,15,25,27],perceiv:16,perfect:20,perfectli:0,perform:[0,1,2,3,4,5,8,12,15,17,18,20,21,23,24,25,26],period:21,person:[20,22,23],perspect:[15,20,23],peter:[6,15],pg_math:18,pgql:15,phd:7,philosophi:8,physic:[15,16,22],pi_loss:[11,12],pi_lr:[0,1,3,5],pick:[16,24],pickl:26,piec:[18,23,30],pieter:[6,7],pip:22,pipe:20,pixel:16,pixelcnn:15,pkl:26,place:[16,17,20],placehold:[0,1,2,3,4,5,12,16,18,21,27],plai:[4,11,15,16,17,20,23],plan:[8,17],plant:20,plappert:6,platform:15,plausibl:20,pleas:[6,16,22,26],plot:[8,12,22,25,29],plotter:[8,24,26,27],plu:[2,16,20,30],plug:18,point:[0,2,6,13,15,16,17,18,20,23,26],polici:[2,3,8,9,13,17,20,22,24,25],policy_delai:3,polyak:[0,2,3],popular:17,pose:0,posit:[1,4,24],possibl:[0,1,4,12,16,17,20,22,23,25,26,27],post:23,potenti:[17,20,21,30],power:23,ppo:[1,2,9,15,16,17,18,20,21,22,25],ppo_ant:25,ppo_ant_cli0:25,practic:[16,18],practition:[18,23],pre:26,preced:14,precis:16,precommit:20,predat:21,predict:[15,17,27],prefer:[0,15,16,22,27],prefix:24,prematur:2,prepar:[17,20,27],preprocess:27,preregistr:20,prescrib:20,presenc:16,present:[1,6,17,23,26],presum:[0,30],pretti:[1,3,16,18,20,23],prevent:[2,20,22,26,27],previou:[0,18,20],previous:[0,1,2,3,4,5,7,12,27],primari:[1,17],primarili:7,principl:[15,17],print:[26,27,30],prior:[16,20],priorit:15,pritzel:15,prob:[8,14],probabl:[0,1,2,3,4,5,13,16,18,20,22],problem:[0,1,2,4,8,15,17,18,20,22,26],problem_set_1:12,problem_set_1_solut:12,problem_set_2:12,problemat:[0,21],proc:30,proc_id:28,proce:13,procedur:[1,4,5,17,27],process:[11,12,14,16,20,25,26,27,28,30],produc:[4,11,17,23,24,25,26,28],product:[4,11,16],profess:23,profit:1,profound:16,program:[0,1,2,3,4,5,20],progress:[1,4,5,10,12,15,20,21,22,23,26,27],project:[6,8,26],promin:17,promis:20,prop:15,properti:16,proport:[2,18],propos:20,proprietari:22,protocol:26,prove:[4,13,16,20],provid:[0,1,2,3,4,5,15,16,24,25,27],proxim:[8,15,17,20,21],pseudocod:[12,23],pseudocount:15,publish:[2,7,21,23],pull:[13,14],punish:16,pure:17,purpos:[15,18,27,28],push:[5,18,20],put:[0,1,11,12,16,18],python3:22,python:[0,8,12,24,25,26,30],pytorch:20,q1_pi:[2,3],q2_pi:2,q_loss:[11,12],q_lr:[0,3],q_pi:[0,11,12],q_pi_targ:[11,12],qualiti:[0,3,12],quantil:[15,17],quantiti:[2,4,27],quantity_valu:27,question:[1,16,17,23],quickli:[3,4,10,12,20,22,23],quit:[10,16,17,20],r2d2:15,r_ph:[11,12],rahtz:20,rai:[0,1,5,6],rainbow:15,rais:[16,18,25],ramp:23,ran:26,randint:27,random:[0,1,2,3,4,5,9,10,11,12,13,15,18,20,21,24,25,27,30],random_norm:16,randomli:16,rang:[0,2,3,17,18,20,23,25,27,30],rank:[27,28],rate:[0,1,2,3,4,5],raw:[16,20],reach:[6,12,26],reactor:15,read:[15,18,20,22,23],readi:[4,20],readm:22,readout:25,readthedoc:26,real:[0,2,3,8,16,17],realli:[17,18,20,24],rearrang:18,reason:[0,1,16,17,20,21,23,26],recal:[18,24],recap:[0,8],receiv:[23,25],recent:[0,5,14,16,17,18,21,26,27],recip:20,recognit:20,recommend:[0,3,18,22,25],record:[18,26,27],recreat:26,recurr:[15,20,21],recurs:24,reduc:[0,1,3,18,20],reduce_mean:[11,12,18,27],reduce_sum:18,reevalu:15,refer:[8,15,16,18,22,23,24,25],referenc:12,reflect:[16,21],refresh:18,regardless:[17,27],region:[8,15,17,21],regress:[3,15,17],regular:[1,3,17,20,23],reimplement:[12,20],reinforc:[0,4,5,7,8,13,16,18,20,23],reinvent:20,rel:[4,12,16,17,18,25],relat:[0,7,15,16,20,21],releas:[0,2,3,6,23],relev:[21,23,27],reli:[1,13,24],reliabl:[17,20,21],relu:[9,11,25,27],remaind:14,remov:[1,2,18,20,23,24],render:[18,26],reparameter:[2,20],repeat:16,repeatedli:18,replai:[0,2,3,15,17],replay_s:[0,2,3],repo:[0,2,3,23],report:[9,20,23,24,27,30],repositori:15,repres:[2,16,17,18,21,26],represent:17,reproduc:[8,20],requir:[2,4,20,22,23],research:[7,8,15,18,21,22,23,25],reset:[15,18],reshap:[11,18,27],residu:20,resnet:20,resort:20,resourc:[8,23,25],respect:[0,2,3,4,13,14,17,18,20],rest:[2,14,17,18],restor:[26,27],restore_tf_graph:[0,1,2,3,4,5,26,27],result:[0,2,3,4,8,11,12,13,14,15,16,17,18,20,22,23,26,27,29,30],resum:26,retro:20,reus:[11,17,20,21],reveal:15,revers:18,review:[8,12,17,23],rew:18,reward:[0,1,2,4,5,8,11,13,14,15,17,20,21],reward_to_go:18,rework:16,rewrit:[2,14],rgb:16,rich:[1,15],richer:17,right:[0,2,8,16,18],rigor:8,risk:20,rllab:[0,1,4,5,15,20,23,30],rllib:[0,1,5,15,23],rmc:15,rnd:15,rnn:20,roadmap:23,robot:[15,16,22],robust:20,roi:[15,17],role:23,rollout:[0,1,2,3,4,5,16,26],root:15,rothfuss:6,rough:20,roughli:[0,1,2,23],row:11,rtg:18,ruder:20,rule:[0,1,2,4,5,13,16,18,20,24],run:[0,1,2,3,4,5,8,9,11,12,16,18,20,21,22,23,24,27],run_entrypoint:30,run_kwarg:25,run_polici:26,run_util:[25,30],rusu:15,s_t:18,sac:[2,9,15,17,21,23,26],safe:[7,15,23],safeti:[7,8,20,23],sai:[0,1,2,12,13,16,22,24],said:[17,23],salakhutdinov:15,saliman:[15,20],same:[0,1,2,3,4,11,12,13,14,16,18,20,24,25,26,27,28],sampl:[0,1,2,3,4,5,12,15,16,17,18,20,21,27,28],santoro:15,satisfi:[0,1,2,3,4,5,17,21],saunder:15,save:[8,18,21,23,30],save_config:27,save_freq:[0,1,2,3,4,5,26,27],save_st:[26,27],saved_model:26,savedmodel:26,saver:26,scalabl:15,scalar:28,scale:[0,1,2,3,8,20,25],schaal:15,schaul:15,scheme:20,schiavo:6,schmidhub:[15,17],scholar:23,schulman:[1,4,5,15,17,20],scienc:[20,22,23],scientif:[20,23],scientist:7,scope:20,score:[12,17],scour:20,scratch:20,screen:26,script:[8,27,28],search:[4,8,15,17,20,24],sebastian:20,second:[0,1,16],section:[0,12,14,17,18,20,21,25,26],see:[0,1,2,3,4,9,11,12,16,17,18,20,21,22,23,24,25,26,27,28,29],seed0:24,seed10:24,seed:[0,1,2,3,4,5,9,10,11,12,20,21,24,25,27,30],seem:[1,18,20],seemingli:4,seen:[1,18],sel:24,select:[0,1,2,3,4,5,16,17,18,24],self:[15,16,17,23,30],send:18,sens:[16,17,18,20,23,26],separ:[0,11,13,20,23,24,26,27,30],septemb:17,sequenc:[16,20,30],serendipit:17,seri:[20,23,25],serial:[26,27,30],seriou:23,serv:[1,3,8,15,25,27],servic:[6,15],sess:[11,18,27],session:[21,27],set:[0,2,3,4,5,8,9,14,16,18,20,21,23,24,26,27,30],setup:[21,26,27],setup_logger_kwarg:30,setup_tf_sav:27,sever:[15,17,20,23,24,25,27],sgd:[0,1,2,3,20],shade:9,shape:[0,1,2,3,4,5,11,16,18,27],share:[16,23,24,25],sharp:3,shift:20,ship:[20,24,25,26,27,30],shogi:15,shorten:[14,30],shorter:[17,22],shortest:20,shorthand:[0,16,25,30],shot:15,should:[6,12,13,15,20,23,30],show:[0,3,5,9,13,14,15,18,24],side:[16,17,18],sidestep:4,signal:[0,3,16,18,20],signific:[12,20],significantli:[1,20],silent:20,silver:[0,15,17],similar:[3,20,23,25],similarli:[21,23],simpl:[0,1,15,16,18,20,21,22,23,24,25,27],simple_sav:26,simpler:[0,1,20,30],simplest:[8,13,20],simpli:[17,25],simplic:20,simplif:20,simplifi:[1,16],simul:[16,22],simultan:[15,26],sinc:[0,4,25],singl:[1,3,4,16,18,20,21,23,25,27],situat:26,six:[12,23],size:[0,2,3,4,9,11,12,16,18,25],skill:[15,20,23],slide:20,slight:18,slightli:[2,4,18,20,26,30],slow:[0,4,15,26],small:[1,4,17,20,25],smaller:[3,12],smallest:4,smallish:4,smooth:[2,3,24],snag:22,snail:15,snapshot:26,snippet:16,soak:25,soft:[8,15,17,21,23],softmax:16,softmax_cross_entropi:27,softwar:[6,22],soil:20,solid:[9,20],solut:[0,4,12,25],solv:[0,1,4,15,20,23],some:[0,3,4,11,16,17,18,20,21,22,23,24,25,26,27],someon:15,somerandomnumb:27,someth:[2,4,16,17,20,23,26,27,28],sometim:[3,12,16,18,20,26],somewher:20,sonic:20,soon:21,sophist:[16,23,28],sort:[20,25],sota:[20,21],sourc:[0,1,2,3,4,5,15,22,27,28,30],space:[0,1,2,3,4,5,15,17,20,21],speak:2,special:[1,4,16,20,26,30],specif:[0,11,12,16,18,20,21,23,30],specifi:[4,24,25,26,27],spectrum:17,speedup:25,spend:20,spent:20,spheric:16,spin:[0,1,2,3,4,5,6,7,12,18,21,23,24,25,26,27,30],spinningup:[22,25,26,30],spinup:[0,1,2,3,4,5,12,18,22,24,25,26,27,28,30],split:[13,21,28,30],squar:[0,2,3,18,25],squash:2,squeez:[11,18],srivastava:20,ss_exp_name_:30,stabil:[4,17,21],stabl:[0,17,18],stablest:2,stack:16,stage:20,stand:12,standalon:[16,23],standard:[0,2,16,17,18,20,21,22,23,25,27],stanford:20,stark:16,start:[0,2,3,14,15,16,17,18,20,21,22,23],start_step:[0,2,3],start_tim:27,starter:12,stat:20,state:[0,1,2,3,4,5,13,14,17,18,20,23,26,27,30],state_dict:27,statist:[4,20,27,28],stave:1,std:[2,9,12,16,20,27,28],stddev:[0,3],stdout:27,stdtest:27,steep:20,steeper:20,stein:15,step:[0,1,2,3,4,5,9,13,16,18,20,23,30],steps_per_epoch:[0,1,2,3,4,5,12,25,27,30],steve:15,still:[1,2,15,20,23],stochast:[0,1,2,4,5,15,17,18,20,26],stone:15,stook:15,stop:[1,4],stop_gradi:[11,12],store:[4,21,25,26,27,30],straightforward:17,strateg:15,strategi:[15,16,23],straw:15,streamlin:23,strength:[17,20],strictli:20,string:[24,25,26,27,30],strong:20,stronger:[20,21],strongest:20,strongli:18,structur:[15,17,21,23,24,26,30],stuck:[10,20],student:[7,18,22,23],studi:[7,15,16,18,20,23],stuff:22,style:[0,2,17],sub:17,subfold:25,subject:[2,16,20],submit:24,subprocess:30,subroutin:[0,17,20],subscript:16,subsect:[13,18],substanti:[3,10,16,17,20],substitut:[2,16],substr:[24,25],subtl:4,subtract:18,succe:[20,22],success:[12,16,30],successfulli:22,successor:2,sudo:22,suffici:4,suggest:[0,12,20],suit:9,suitabl:21,sum:[16,18],supersed:20,supervis:[15,18,20],supplementari:20,suppli:[23,25],support:[0,1,2,3,4,5,6,8,20,21,22,25,26],suppos:[0,1,16,18,24],sure:[1,4,5,20,23,25,26,27],surpris:15,surrog:[4,17],suspect:17,sutton:[5,15,17],swap:20,symbol:[0,1,2,3,4,5,12,16],sync:28,sync_all_param:28,synchron:20,system:[16,22],systemat:[15,20],szegedi:20,szepesvari:[15,17,20],tab:[26,27],tack:20,tailor:27,take:[0,1,2,3,4,5,12,16,17,18,20,23,25,30],taken:[16,17,18,28,30],talk:16,tang:15,tanh:[9,11,16,25,27],target:[0,2,3,11],target_kl:1,target_nois:3,task:[9,15,17,20,21,23,25],tau:18,taught:16,taxonomi:[8,15],taylor:[4,20],td3:[0,2,3,9,15,17,21,23],teach:[16,23],team:7,technic:16,techniqu:[17,20,21],technolog:23,tell:[0,1,16,20,24],temp:27,templat:21,tempor:15,ten:9,tend:[2,4,16,17],tensor:[11,16,18,26,27],tensorflow:[0,1,2,3,4,5,8,11,12,16,18,20,22,25,26,27],tensorshap:11,term:[0,1,2,3,4,5,12,13,16,18,20,23,30],termin:[0,26,28],terminolog:[8,17,20,23],terribl:17,test:[0,2,3,6,9,17,20,21,22,26,27],test_polici:[0,1,2,3,4,5,22,26],textbook:23,than:[1,3,12,16,17,18,20,24,25,30],thank:6,thei:[0,1,2,4,11,12,16,17,18,20,21,23,25,26,27],them:[0,12,15,16,17,18,20,21,23,24,25,26,27],theorem:[15,20],theoret:[0,4,15],theori:[0,5,8,18,20,21,23],thi:[0,1,2,3,4,5,6,8,11,12,13,14,15,16,17,18,20,21,22,24,26,27,28,30],thin:[25,27],thing:[0,4,12,14,16,17,18,20,22,25,26,27],think:[1,2,6,12,13,15,16,17,20,23],thorough:[15,20],thoroughli:20,those:[9,12,16,17,18,20,24,25,26],though:[0,2,13,16,18,22,25],thought:[0,2,8],thousand:4,thread:20,three:[3,10,11,12,18,23,30],threshold:1,through:[2,4,16,17,18,20,21,22,23,24,25],throughout:[0,3,18,23,26,27],thu:[0,1,2,3,4,13,18],thunk:[25,30],tie:18,tied:20,tim:20,time:[0,2,3,12,13,14,15,16,17,20,22,23,25,27],timeless:5,timelimit:26,timestamp:[25,30],timestep:[2,9,14],tmp:[26,27],togeth:[0,3,11,17,18],toi:20,too:[0,1,4,20],tool:[0,1,2,3,4,5,8,16,23,25,26,27,30],top:[16,27],topic:[7,18,20,23],total:[11,22],totalenvinteract:24,totalgradientstep:27,toward:[0,1,2,3,15,22],tpro:9,trace:15,traceback:26,track:[22,27,30],trade:[2,17,21,23],tradeoff:2,trail:21,train:[0,1,2,3,4,5,8,9,10,15,16,17,18,20,21,22,25,27],train_mnist:27,train_one_epoch:18,train_op:[18,27],train_pi_it:1,train_v_it:[1,4,5,10,12],trajectori:[0,1,2,3,4,5,9,13,14,18,20,26],transfer:[8,17,20],transit:[0,16,17],translat:[18,20],transmut:23,trap:[1,4,5],treat:[0,11,13,16],treatment:[18,25],tree:[15,17],tri:[1,23],triad:17,trial:[15,16,22,23],trick:[0,1,2,3,17,18,20,23],tricki:20,trivial:[0,11],troubl:[20,22],trpo:[1,2,4,10,15,16,17,18,21],truli:15,trust:[8,15,17,21],truth:17,tsitsikli:[15,17],tucker:15,tune:[0,2,3,15,17,20],tupl:[0,16,18],turn:[1,11,17,18,20],turnaround:20,tutori:20,tweak:[18,20,26],twin:[8,17,21],two:[0,1,2,3,13,15,16,17,18,20,21,25,30],txt:[26,27],type:25,typic:[1,3,4,5,12,16,17,18,22,23,24,27],ucl:20,ultim:[18,21],unaccept:0,unbeat:15,unchang:3,unclear:16,uncommon:16,uncorrel:[0,3],under:[16,17,20,25],underli:[0,5,16],underscor:30,understand:[20,23],undiscount:[5,14,16,18],unfamiliar:[20,23],unifi:15,uniform:[0,2,3],unintent:15,uniqu:27,unit:[9,16,27],univers:15,unless:[20,24,25],unlik:17,unnecessari:20,unpack:26,unreal:15,unreason:20,unseri:30,unsolv:20,unstabl:[0,21],until:[5,21],untrain:10,upcom:23,updat:[0,1,2,3,4,5,9,17,18,20,21,22,23,28],ups:23,upsid:17,use:[0,1,2,3,4,5,9,12,13,16,17,18,20,21,22,23,24,25,26,27,28],use_soln:12,used:[0,1,2,3,4,5,9,12,16,17,18,21,22,23,24,25,26,27,30],useful:[0,3,12,15,16,17,18,20,21,22,25,30],useless:18,user:[8,23,25,30],user_config:[25,26,30],uses:[0,1,2,3,12,17,20,25],using:[0,1,2,3,4,5,12,13,14,15,16,17,18,20,21,23,25,26,27,30],usual:[0,1,2,3,4,16,17,18,20,27],util:[8,21,23,24,25,26,27],uvfa:15,vail:15,val:[27,30],valid:[0,2,3,16,18,20,25,30],valor:15,valu:[0,1,2,3,4,5,9,10,11,15,17,18,20,21,24,25,27,30],valuabl:20,value_1:25,value_2:25,van:[15,17],vandenbergh:4,vanilla:[4,8,18,20,21],var_list:28,vari:[2,24,25],variabl:[2,13,18,20,26,28],variable_scop:11,varianc:[18,20,25],variant:[0,1,17,20,27,30],variant_nam:30,variantgener:30,variat:[15,20],varieti:[0,3,15,16,20,21,23],variou:[6,12,20,21,25],vaswani:20,vector:[2,4,11,16,28],veloc:16,veri:[0,4,12,16,18,21,22],verifi:[2,20],version:[1,2,4,5,11,12,17,18,20,26,27],versu:12,vestigi:25,vezhnevet:15,vf_lr:[1,4,5],via:[0,1,5,12,13,15,16,17,21,24,27],vic:15,video:[20,22],view:[18,23],vime:15,virtual:23,vision:15,visit:4,visual:[16,20],vivo:12,volatil:3,vpg:[2,4,5,9,16,18,20,21],wai:[0,1,2,3,4,5,13,16,17,18,20,21,22,23,24,25,26,27,30],walker2d:[9,22,25],walker:25,wall:12,wang:15,want:[0,16,17,20,22,23,25,26,27,30],wari:20,warm:23,wasn:[21,26],watch:[20,22,25,26],wayn:15,weak:[17,20],weaker:[20,21],weber:[15,17],week:[20,23],weight:[2,15,16,18,20],weights_ph:18,weird:11,welcom:[16,23],well:[0,1,2,3,17,18,20,22,23,27],weng:[6,20],went:13,were:[0,3,4,6,9,20],what:[0,1,2,3,8,11,12,13,15,18,20,22,24,25,26,27],whatev:[16,20,27],whatsoev:18,wheel:20,when:[0,1,4,11,12,14,16,17,18,20,21,22,26,27,30],whenev:[4,16,20],where:[0,1,2,3,4,5,11,12,13,15,16,17,18,20,21,23,26,30],wherea:11,wherev:16,whether:[0,16,17,30],which:[0,1,2,3,4,5,11,12,14,15,16,17,18,20,21,22,23,24,25,26,27,30],whichev:[3,16],who:[6,20,23],whole:[0,1,2,3,4,5,11,20],whose:[16,17,18,27,30],why:[2,8,13,16,17,22],wide:[0,3,15,16,20,23,24],widespread:18,willing:17,win:20,wind:2,window:[16,17,22,24],wipe:27,wish:0,with_min_and_max:[27,28],within:12,without:[1,4,9,12,15,18,20,23,28],won:[16,18,22],wonder:21,word:16,work:[0,2,3,4,5,7,12,13,17,18,20,21,22,23,24],workaround:22,worker:[27,28],world:[8,16,17],worri:[13,22],wors:18,worth:[0,1,4,15,20,22],worthwhil:[18,20],would:[0,3,4,11,12,16,17,18,20,23,27],wouldn:[20,25],wrap:[20,26,30],wrapper:[25,26],write:[13,16,17,20,22,25,27],writer:15,written:[11,20,27],wrong:[12,18,20],wulfmeier:15,x_ph:[0,1,2,3,4,5,27],x_train:27,xaxi:24,y_ph:27,y_train:27,yang:6,year:[20,22,23],yet:[2,20,23],yield:[4,20],you:[5,6,8,11,12,15,17,20,21,23,28,30],younger:21,your:[0,8,12,16,20,26,27],your_env:26,yourself:20,zeigler:6,zero:[0,3,4,13,18],zeros_lik:18,ziebart:15,zokhov:6,zshell:25},titles:["Deep Deterministic Policy Gradient","Proximal Policy Optimization","Soft Actor-Critic","Twin Delayed DDPG","Trust Region Policy Optimization","Vanilla Policy Gradient","Acknowledgements","About the Author","Welcome to Spinning Up in Deep RL!","Benchmarks for Spinning Up Implementations","Solution to Exercise 2.1","Solution to Exercise 2.2","Exercises","Extra Material","Extra Material","Key Papers in Deep RL","Part 1: Key Concepts in RL","Part 2: Kinds of RL Algorithms","Part 3: Intro to Policy Optimization","Limitations and Frontiers","Spinning Up as a Deep RL Researcher","Algorithms","Installation","Introduction","Plotting Results","Running Experiments","Experiment Outputs","Logger","MPI Tools","Plotter","Run Utils"],titleterms:{"class":27,"function":[12,14,16,26],"long":19,"public":[0,1,2,3,4,5],"return":16,Doing:20,Not:26,One:25,The:[0,11,16,20,21],These:[0,1,4,5,21],Using:[14,25,26,27],about:7,acknowledg:6,action:[15,16],actor:2,actual:25,advantag:16,algorithm:[12,15,17,21,26],analysi:15,ant:9,author:7,background:[0,1,2,3,4,5,20],base:[15,17],baselin:[15,18],basic:[12,18],bellman:16,benchmark:9,bonu:15,bug:[11,12],built:23,call:30,can:16,categor:16,caution:[],challeng:12,check:22,classic:15,close:20,code:[11,12,21,23],combin:15,command:25,complex:19,comput:12,concept:16,config:25,consist:15,content:[0,1,2,3,4,5,9,12,15,16,17,18,20,21,22,23,25,26,27,28,30],core:[21,28],critic:2,critiqu:15,ddpg:[0,3,12],deep:[0,8,15,20],delai:3,depend:15,deriv:18,design:[19,23],detail:[9,25],determin:25,determinist:[0,15,16],develop:20,diagon:16,direct:15,directori:26,distract:[13,18],distribut:15,document:[0,1,2,3,4,5],don:[13,18,25],each:9,entropi:2,environ:[9,25,26],equat:[0,1,2,3,4,5,16],error:26,evolutionari:15,exampl:27,exercis:[10,11,12],expect:18,experi:[9,25,26,30],experimentgrid:[25,30],exploit:[0,1,2,3,4,5],explor:[0,1,2,3,4,5,15],extra:[13,14,25],fact:[0,1,2,3,4,5],failur:12,file:21,fit:12,flag:25,form:18,formal:16,format:21,formula:14,found:26,free:[15,17],from:[12,25],frontier:19,gaussian:[12,16],given:15,grad:18,gradient:[0,5,14,15,18],graph:[12,27],guid:25,gum:11,gym:[],halfcheetah:9,hierarchi:15,hopper:9,horizon:19,how:[11,23,25],hyperparamet:25,imit:15,implement:[0,1,2,3,4,5,9,12,18],includ:21,indic:8,instal:22,intrins:15,intro:18,introduct:23,invers:15,kei:[0,1,2,3,4,5,15,16],kind:17,know:[0,1,2,3,4,16,18,22,24,25,26,27],launch:25,learn:[0,2,15,17,20],lemma:18,let:[13,18],likelihood:12,limit:19,line:25,link:17,load:[26,27],locat:26,log:[12,18,27],logger:27,mac:22,materi:[13,14],memori:15,meta:15,mission:23,mode:12,model:[0,1,2,3,4,5,15,17],motiv:15,mpi:[27,28],mujoco:22,multipl:25,multitask:15,need:25,observ:16,off:21,onc:25,openai:[],openmpi:22,optim:[1,4,16,18],option:[16,22],other:[0,1,2,3,4,5,15,18,20],our:23,output:26,paper:[0,1,2,3,4,5,15],part:[16,17,18],past:[13,18],path:15,perform:9,philosophi:23,plan:23,plot:24,plotter:29,polici:[0,1,4,5,12,14,15,16,18,21,26],ppo:12,prob:18,problem:[12,16],project:20,proof:[13,14,18],proxim:1,pseudocod:[0,1,2,3,4,5],python:22,quick:[0,1,2,3,4,5],quickstart:25,real:15,recap:18,refer:[0,1,2,3,4,5,20],region:4,regular:2,reinforc:[2,15],relev:[0,1,2,3,4,5],reproduc:15,request:12,research:[12,20],resourc:20,result:[24,25],review:15,reward:[16,18,19],right:20,rigor:20,run:[25,26,30],safeti:15,sampl:19,save:[0,1,2,3,4,5,25,26,27],scale:15,scratch:12,script:25,serv:23,set:[12,25],shortcut:25,should:[0,1,2,3,4,16,18,22,24,25,26,27],side:0,silent:12,simplest:18,soft:2,solut:[10,11],space:16,special:25,spin:[8,9,20,22],state:16,stochast:16,successfulli:26,suffix:25,support:23,swimmer:9,tabl:[0,1,2,3,4,5,8,9,12,15,16,17,18,20,21,22,23,25,26,27,28,30],task:19,taxonomi:17,td3:12,tensorflow:28,terminolog:16,theori:15,thi:[23,25],thought:20,tool:28,train:26,trajectori:16,transfer:15,trpo:12,trust:4,twin:3,ubuntu:22,unsupervis:15,util:[28,30],valu:[12,16,26],vanilla:5,walker:9,warn:[],welcom:8,what:[16,17,21,23],where:25,why:[0,1,4,5,21,23],work:11,world:15,write:12,you:[0,1,2,3,4,13,16,18,22,24,25,26,27],your:22}})
\ No newline at end of file
diff --git a/docs/_build/html/spinningup/keypapers.html b/docs/_build/html/spinningup/keypapers.html
index 2b2335916..32c07de38 100644
--- a/docs/_build/html/spinningup/keypapers.html
+++ b/docs/_build/html/spinningup/keypapers.html
@@ -659,7 +659,7 @@ a. Model is Learned
-[61] Model-Based Value Estimation for Efficient Model-Free Reinforcement Learning, Feinberg et al, 2018. Algorithm: MBVE.
+[61] Model-Based Value Expansion for Efficient Model-Free Reinforcement Learning, Feinberg et al, 2018. Algorithm: MVE.