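"""Test harness for the q_wot 1v1 q-learning agent.

Run with one of:
  --get-baselines    plus --random-v-random, --greedy-v-random, or --greedy-v-greedy
  --train-q-learning plus --q-v-random or --q-v-greedy (optionally --use-pretrained)
  --parameter-tuning to sweep epsilon, discount factor, and learning rate
"""
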
import sys
from enum import Enum
from time import time

from battle import Battle
from policy import Policy, RandomPolicy, GreedyShooterRandomPolicy
from q_wot import q_learn_1v1
from tank import Tank


class Result(Enum):
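    """Outcome of a single battle, from team 0's perspective."""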
    TEAM0_WIN = 1
    DRAW = 0
    TEAM1_WIN = -1


def simulate_1v1(p0_policy: Policy, p1_policy: Policy, verbose: bool = False) -> Result:
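    """Play a single 1v1 battle between two policies and return the result.

    Both arguments are policy classes, not instances; each is constructed
    with the Battle so the policy can observe the battle state.
    """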
    b = Battle([Tank()], [Tank()])
    # Instantiate each policy class with the battle it will play in.
    p0_policy = p0_policy(b)  # type: ignore
    p1_policy = p1_policy(b)  # type: ignore
    if verbose:
        print(f"Ticks left: {b.ticks_left}")
    while not b.battle_is_over():
        team0_actions, team1_actions = b.generate_all_player_actions()
        player0_actions = team0_actions[0]
        player1_actions = team1_actions[0]
        # choose_action takes (team index, player index, available actions).
        b.apply_all_player_actions(
            ([p0_policy.choose_action(0, 0, player0_actions)],
             [p1_policy.choose_action(1, 0, player1_actions)]),
            verbose,
        )
    if verbose:
        print(f"\nGAME OVER. Ticks left: {b.ticks_left}")
    result: Result = Result.DRAW
    if b.win(0):
        if verbose:
            print("Win for team 0.")
        result = Result.TEAM0_WIN
    elif b.win(1):
        if verbose:
            print("Win for team 1.")
        result = Result.TEAM1_WIN
    elif verbose:
        print("Draw.")
    if verbose:
        b.print_all_tank_stats()
    return result


def simulate_n_battles(p0_policy: Policy, p1_policy: Policy, num_battles: int = 10000, verbose: bool = True) -> tuple[int, int, int]:
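    """Run num_battles 1v1 battles and return (wins, draws, losses) for team 0."""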
    wins, draws, losses = 0, 0, 0
    for _ in range(num_battles):
        result: Result = simulate_1v1(p0_policy, p1_policy)  # type: ignore
        if result == Result.DRAW:
            draws += 1
        elif result == Result.TEAM0_WIN:
            wins += 1
        else:
            losses += 1
    if verbose:
        print(f"Out of {num_battles} battles, team 0 had:\n{wins} wins\n{draws} draws\n{losses} losses")
    return wins, draws, losses


if __name__ == "__main__":
    if "--get-baselines" in sys.argv:
        print("Computes the baseline results of playing the greedy & random agents against each other or themselves for 10k games.")
        if "--random-v-random" in sys.argv:
            policy0, policy1 = RandomPolicy, RandomPolicy
        elif "--greedy-v-random" in sys.argv:
            policy0, policy1 = GreedyShooterRandomPolicy, RandomPolicy
        elif "--greedy-v-greedy" in sys.argv:
            policy0, policy1 = GreedyShooterRandomPolicy, GreedyShooterRandomPolicy
        else:
            print("When using the --get-baselines command, please also add --random-v-random, --greedy-v-random, or --greedy-v-greedy")
            sys.exit(1)
        simulate_n_battles(policy0, policy1, 10000)  # type: ignore
    elif "--train-q-learning" in sys.argv:
        print("This option trains the q-learning agent for 100k battles against the given agent, then simulates 10k battles.")
        if "--q-v-random" in sys.argv:
            policy1_str = "random"
            policy1 = RandomPolicy
        elif "--q-v-greedy" in sys.argv:
            policy1_str = "greedy"
            policy1 = GreedyShooterRandomPolicy
        else:
            print("When using the --train-q-learning command, please also add --q-v-random or --q-v-greedy")
            sys.exit(1)
        start: float = time()
        if "--use-pretrained" in sys.argv:
            print("Using pre-trained q-values.")
            policy0 = q_learn_1v1(policy1_str, trained_filename=f"trained_qvalues/trained_{policy1_str}_example.pickle")
        else:
            print("Beginning q-learning training.")
            policy0 = q_learn_1v1(policy1_str, 100000)
        print(f"Completed training. Time elapsed: {round((time() - start) / 60, 2)} minutes.")
        print("Beginning battle simulations.")
        simulate_n_battles(policy0, policy1, 10000)  # type: ignore
        print(f"Total training & simulation time: {round((time() - start) / 60, 2)} minutes.")
    elif "--parameter-tuning" in sys.argv:
        # Iterates over combinations of epsilon, discount factor, and initial
        # learning rate to see which gives the best results. Each
        # train-and-simulate run takes a little under 4.5 minutes, so roughly
        # 13 combinations can be tested per hour.
        for epsilon in [0.1, 0.2, 0.3]:
            for discount_factor in [0.2, 0.4, 0.6]:
                for learning_rate in [0.1, 0.2, 0.3]:
                    for _ in range(5):
                        policy0 = q_learn_1v1("greedy", 100000, epsilon, discount_factor, learning_rate)
                        wins, draws, losses = simulate_n_battles(policy0, GreedyShooterRandomPolicy, 10000, False)  # type: ignore
                        # Since each run takes about 5 minutes and the whole sweep
                        # takes 10+ hours, only open the results file once the new
                        # data is ready, and close it again right away.
                        result_line = f"epsilon: {epsilon}, discount factor: {discount_factor}, learning rate: {learning_rate} | W: {wins}, D: {draws}, L: {losses}"
                        with open("results.txt", "at") as f:
                            f.write(result_line + "\n")
                        print(result_line)
    else:
        print("Please use the --get-baselines or --train-q-learning options.")
        sys.exit(1)