# lake-dp-policy-iteration.py
import gymnasium as gym
# Create a FrozenLake environment on a custom 4x4 map, with slipping disabled
# and on-screen ("human") rendering.
env = gym.make(
    "FrozenLake-v1",
    render_mode="human",
    is_slippery=False,
    desc=["FFFF", "SHFH", "FFFH", "HFFG"],
)

# Start the first episode and decode the flat observation index into grid
# coordinates, treating it as row-major over 4 columns.
observation, info = env.reset()
current_row = observation // 4
current_col = observation % 4

# Report the starting position, one value per line.
for report in (f"{observation=}", f"{current_row=}", f"{current_col=}"):
    print(report)

env.render()
# State-value table: one entry per cell of the 4x4 map (states 0..15), all
# starting at 0.
VS = {state: 0 for state in range(16)}
print(f"{VS=}")

# NOTE(review): disabled placeholder kept from the original; presumably a
# per-state store of sampled returns -- confirm before re-enabling.
# ReturnsS = dict()

# Placeholders for tracking the best episode seen and its return; nothing in
# this file assigns them yet.
best_episode = None
best_episode_G = None
def generate_episode(environment=None):
    """Roll out one episode using randomly sampled actions.

    The environment is reset at the start of the episode so that the state
    each action is taken from is known. The environment is left in its
    finished state on return; the next call resets it again.

    Args:
        environment: Object exposing the Gymnasium-style ``reset()``,
            ``step(action)`` and ``action_space.sample()`` calls. Defaults
            to the module-level ``env``.

    Returns:
        A list with one dict per time step, in chronological order. Each
        dict holds ``"state"`` (the observation the action was taken in),
        ``"action"`` (the sampled action) and ``"reward"`` (the reward
        returned by that step).
    """
    if environment is None:
        environment = env

    state, _ = environment.reset()
    episode = []
    while True:
        action = environment.action_space.sample()
        next_state, reward, terminated, truncated, _ = environment.step(action)
        # Pair the action and reward with the state the action was taken in,
        # not with the state it led to.
        episode.append({"state": state, "action": action, "reward": reward})
        if terminated or truncated:
            break
        state = next_state
    return episode
# Sample 1000 episodes and walk each one backwards (last step first).
# NOTE(review): the update is still a placeholder -- every visited state's
# value is overwritten with 0 and the previous value `v` is read but never
# used. The trailing comment hints that a sum over actions weighted by the
# policy is intended; confirm the formula before relying on VS.
for _ in range(1000):
    for step in reversed(generate_episode()):
        state = step["state"]
        v = VS[state]
        VS[state] = 0  # Sum_a (p(a|s))