import warnings

from rlagents.functions.decay import DecayBase, FixedDecay


class OptimiserBase(object):
    """Base class for optimisers that update a model from stored memory."""

    def __init__(self, model=None, memory=None):
        self.model = model
        self.memory = memory

    def _is_valid(self):
        # An optimiser can only run once both a model and a memory are attached.
        return self.model is not None and self.memory is not None

    def run(self):
        raise NotImplementedError

    def configure(self, model, memory):
        self.model = model
        self.memory = memory

    def export(self):
        raise NotImplementedError


class DefaultOptimiser(OptimiserBase):
    """No-op optimiser: the model is never updated."""

    def run(self):
        pass

    def export(self):
        return {"Type": "Default"}


class MonteCarlo(OptimiserBase):
    """
    Monte Carlo optimisation updates every state-action value in an
    episode's history towards the cumulative reward observed when the
    episode terminates.
    """

    def __init__(self, model=None, memory=None, discount=0.95, learning_rate=None):
        OptimiserBase.__init__(self, model, memory)
        self.discount = discount
        self.learning_rate = learning_rate

    @property
    def learning_rate(self):
        return self._learning_rate

    @learning_rate.setter
    def learning_rate(self, lr):
        if not isinstance(lr, DecayBase):
            # Fall back to a default decay schedule when no valid DecayBase is supplied.
            lr = FixedDecay(1, decay=0.995, minimum=0.05)
            warnings.warn('Learning Rate type invalid, using default. ({0})'.format(lr))
        self._learning_rate = lr

    def run(self):
        assert self._is_valid()

        # Only update once the episode has finished.
        terminal_state = self.memory.fetch_last(1)
        if not terminal_state['done'][0]:
            return

        # Update every state-action pair in the episode towards the episode's
        # total (undiscounted) reward.
        m = self.memory.fetch_episode(1)
        reward = m['rewards'].sum()

        for i in range(len(m)):
            self.model.update(m.iloc[i]['observations'], m.iloc[i]['actions'], reward)

        self.learning_rate.update()

    def export(self):
        return {"Type": "Monte Carlo",
                "Discount": self.discount,
                "Learning Rate": self.learning_rate.export()}


class TemporalDifference(OptimiserBase):
    """
    Temporal Difference updates a state-action pair based on the observed
    reward and the expected future reward of the next state.
    """

    def __init__(self, model=None, memory=None, discount=0.95, learning_rate=None):
        OptimiserBase.__init__(self, model, memory)
        self.discount = discount
        self.learning_rate = learning_rate

    @property
    def learning_rate(self):
        return self._learning_rate

    @learning_rate.setter
    def learning_rate(self, lr):
        if not isinstance(lr, DecayBase):
            # Fall back to a default decay schedule when no valid DecayBase is supplied.
            lr = FixedDecay(1, decay=0.995, minimum=0.05)
            warnings.warn('Learning Rate type invalid, using default. ({0})'.format(lr))
        self._learning_rate = lr

    def run(self):
        assert self._is_valid()

        m = self.memory.fetch_last(1)

        observation = m['new_obs'][0]
        done = m['done'][0]
        reward = m['rewards'][0]
        prev_obs = m['observations'][0]
        prev_action = m['actions'][0]

        # Expected future reward is zero once the episode has terminated.
        future = self.model.state_value(observation) if not done else 0.0

        # TD update: Q(s, a) <- Q(s, a) + alpha * (r + gamma * V(s') - Q(s, a))
        current = self.model.state_action_value(prev_obs, prev_action)
        new_val = current + self.learning_rate.value * (reward + self.discount * future - current)

        self.model.update(prev_obs, prev_action, new_val)

        # Decay the learning rate once per episode.
        if done:
            self.learning_rate.update()

    def export(self):
        return {"Type": "Temporal Difference",
                "Discount": self.discount,
                "Learning Rate": self.learning_rate.export()}