MCTS.py (forked from suragnair/alpha-zero-general)
import math

import numpy as np

EPS = 1e-8  # small constant added under the sqrt so unvisited nodes still get a non-zero exploration bonus


class MCTS():
"""
This class handles the MCTS tree.
"""

    def __init__(self, game, nnet, args):
self.game = game
self.nnet = nnet
self.args = args
self.Qsa = {} # stores Q values for s,a (as defined in the paper)
self.Nsa = {} # stores #times edge s,a was visited
self.Ns = {} # stores #times board s was visited
self.Ps = {} # stores initial policy (returned by neural net)
        self.Es = {}  # stores game.getGameEnded results for board s
self.Vs = {} # stores game.getValidMoves for board s

    def getActionProb(self, canonicalBoard, temp=1):
"""
This function performs numMCTSSims simulations of MCTS starting from
canonicalBoard.
Returns:
probs: a policy vector where the probability of the ith action is
proportional to Nsa[(s,a)]**(1./temp)
"""
for i in range(self.args.numMCTSSims):
self.search(canonicalBoard)
s = self.game.stringRepresentation(canonicalBoard)
counts = [self.Nsa[(s,a)] if (s,a) in self.Nsa else 0 for a in range(self.game.getActionSize())]
        if temp == 0:
            bestA = np.argmax(counts)
            probs = [0] * len(counts)
            probs[bestA] = 1
            return probs

        counts = [x ** (1. / temp) for x in counts]
        probs = [x / float(sum(counts)) for x in counts]
return probs

    def search(self, canonicalBoard):
"""
        This function performs one iteration of MCTS. It is recursively called
        until a leaf node is found. The action chosen at each node is the one
        with the maximum upper confidence bound, as in the paper.

        Once a leaf node is found, the neural network is called to return an
        initial policy P and a value v for the state. This value is propagated
        up the search path. If the leaf node is a terminal state, the outcome
        is propagated up the search path instead. The values of Ns, Nsa and Qsa
        are updated.

        NOTE: the return value is the negative of the value of the current
        state. This is done since v is in [-1,1] and if v is the value of a
        state for the current player, then its value is -v for the other player.
Returns:
v: the negative of the value of the current canonicalBoard
"""
s = self.game.stringRepresentation(canonicalBoard)
if s not in self.Es:
self.Es[s] = self.game.getGameEnded(canonicalBoard, 1)
if self.Es[s]!=0:
# terminal node
return -self.Es[s]
        if s not in self.Ps:
            # leaf node: expand it with the network's policy and value
            self.Ps[s], v = self.nnet.predict(canonicalBoard)
            valids = self.game.getValidMoves(canonicalBoard, 1)
            self.Ps[s] = self.Ps[s] * valids  # masking invalid moves
            sum_Ps_s = np.sum(self.Ps[s])
            if sum_Ps_s > 0:
                self.Ps[s] /= sum_Ps_s  # renormalize
            else:
                # all valid moves were masked to zero probability; fall back to a
                # uniform policy over the valid moves instead of dividing by zero
                self.Ps[s] = self.Ps[s] + valids
                self.Ps[s] /= np.sum(self.Ps[s])
            self.Vs[s] = valids
            self.Ns[s] = 0
            return -v
valids = self.Vs[s]
cur_best = -float('inf')
best_act = -1
# pick the action with the highest upper confidence bound
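        # PUCT selection rule (as in the AlphaGo Zero paper):
        #   u(s, a) = Q(s, a) + cpuct * P(s, a) * sqrt(N(s)) / (1 + N(s, a))
        # where N(s) = sum_b Nsa[(s, b)]. The first term favours actions that have
        # returned high values so far; the second favours actions with a high prior
        # that have been visited rarely relative to the parent's visit count.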
for a in range(self.game.getActionSize()):
if valids[a]:
                if (s, a) in self.Qsa:
                    u = self.Qsa[(s, a)] + self.args.cpuct * self.Ps[s][a] * math.sqrt(self.Ns[s]) / (1 + self.Nsa[(s, a)])
                else:
                    u = self.args.cpuct * self.Ps[s][a] * math.sqrt(self.Ns[s] + EPS)  # Q = 0 for unvisited edges; EPS keeps the bonus non-zero when Ns[s] == 0
if u > cur_best:
cur_best = u
best_act = a
a = best_act
next_s, next_player = self.game.getNextState(canonicalBoard, 1, a)
next_s = self.game.getCanonicalForm(next_s, next_player)
v = self.search(next_s)
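        # incremental mean: fold the new value v into the running average Q(s, a),
        # e.g. with Nsa = 3, Qsa = 0.5, v = 1.0 -> new Qsa = (3*0.5 + 1.0)/4 = 0.625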
if (s,a) in self.Qsa:
self.Qsa[(s,a)] = (self.Nsa[(s,a)]*self.Qsa[(s,a)] + v)/(self.Nsa[(s,a)]+1)
self.Nsa[(s,a)] += 1
else:
self.Qsa[(s,a)] = v
self.Nsa[(s,a)] = 1
self.Ns[s] += 1
return -v
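

# ----------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original file): how this class is
# typically driven in a self-play loop. `Game`, `NNetWrapper` and `dotdict` are
# hypothetical stand-ins for a concrete game implementation, a network wrapper
# exposing predict(), and a simple attribute-style dict supplying
# numMCTSSims and cpuct.
#
#   from some_game import Game              # hypothetical module
#   from some_nnet import NNetWrapper       # hypothetical module
#
#   args = dotdict({'numMCTSSims': 25, 'cpuct': 1.0})
#   game = Game()
#   nnet = NNetWrapper(game)
#   mcts = MCTS(game, nnet, args)
#
#   board = game.getInitBoard()
#   cur_player = 1
#   while game.getGameEnded(board, cur_player) == 0:
#       canonical = game.getCanonicalForm(board, cur_player)
#       pi = mcts.getActionProb(canonical, temp=1)
#       action = np.random.choice(len(pi), p=pi)
#       board, cur_player = game.getNextState(board, cur_player, action)
# ----------------------------------------------------------------------------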