q_wot.py
from policy import Policy, RandomPolicy, GreedyShooterRandomPolicy
from battle import Action, ActionType, Battle
from tank import Tank, TankState
from random import shuffle, random, choice
import pickle
from datetime import datetime
from typing import Union
# (in_light_cover, in_heavy_cover, ready_to_shoot, moving, possible_target, enemy_moving, enemy_light_cover)
State = tuple[bool, bool, bool, bool, bool, bool, bool]
Q = tuple[State, ActionType]
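# For illustration only (the real ActionType members are defined in battle.py), one stored
# entry might look like:
#   ((False, False, True, False, True, False, False), ActionType.SHOOT) -> (0.42, 0.3)
# i.e. "uncovered, loaded, stationary, enemy visible" mapped to a (Q-value, learning rate) pair.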
def get_weight_and_learning_rate(weights: dict[Q, tuple[float, float]], state: State, action_type: ActionType, INITIAL_LEARNING_RATE: float) -> tuple[float, float]:
    # Lazily initialise unseen (state, action) pairs with weight 0.0 and the starting learning rate.
    if (state, action_type) not in weights:
        weights[(state, action_type)] = (0.0, INITIAL_LEARNING_RATE)
    return weights[(state, action_type)]
def set_weight(weights: dict[Q, tuple[float, float]], state: State, action_type: ActionType, new_weight: float) -> None:
    # Also decays the learning rate multiplicatively. (The previous `learning_rate ** 0.99`
    # actually *raises* a rate below 1 slightly, so it never decayed.)
    _, learning_rate = weights[(state, action_type)]
    weights[(state, action_type)] = new_weight, learning_rate * 0.99
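    # e.g. with l_rate = 0.3 the schedule for one (state, action) pair runs 0.3, 0.297, 0.294, ...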
def best_action_type(weights: dict[Q, tuple[float, float]], state: State, allowed_action_types: list[ActionType], INITIAL_LEARNING_RATE: float) -> ActionType:
    shuffle(allowed_action_types)  # break ties between equally-valued actions at random
    best_found_action_type: ActionType = allowed_action_types[0]
    best_found_reward: float = float("-inf")  # starting at 0.0 would wrongly beat all-negative Q-values
    for action_type in allowed_action_types:
        reward, _ = get_weight_and_learning_rate(weights, state, action_type, INITIAL_LEARNING_RATE)
        if reward > best_found_reward:
            best_found_reward = reward
            best_found_action_type = action_type
    return best_found_action_type
def best_action_value(weights: dict[Q, tuple[float, float]], state: State, allowed_action_types: list[ActionType], INITIAL_LEARNING_RATE: float) -> float:
    # Terminal states offer no actions and are worth 0; otherwise take the max Q-value.
    if not allowed_action_types:
        return 0.0
    return max(get_weight_and_learning_rate(weights, state, action_type, INITIAL_LEARNING_RATE)[0]
               for action_type in allowed_action_types)
def epsilon_greedy(weights: dict[Q, tuple[float, float]], state: State, allowed_action_types: list[ActionType], EPSILON: float, INITIAL_LEARNING_RATE: float) -> ActionType:
    # With probability EPSILON explore uniformly at random; otherwise exploit the current Q-values.
    if random() <= EPSILON:
        return choice(allowed_action_types)
    return best_action_type(weights, state, allowed_action_types, INITIAL_LEARNING_RATE)
def choose_qaction(weights: dict[Q, tuple[float, float]], state: State, actions: list[Action], EPSILON: float, INITIAL_LEARNING_RATE: float) -> Action:
    # Q-values are kept per action *type*; pick a type, then a concrete action of that type at random.
    allowed_action_types: list[ActionType] = list({action[0] for action in actions})
    chosen_action_type: ActionType = epsilon_greedy(weights, state, allowed_action_types, EPSILON, INITIAL_LEARNING_RATE)
    return choice([action for action in actions if action[0] == chosen_action_type])
def choose_best_qaction(weights: dict[Q, tuple[float, float]], state: State, actions: list[Action], INITIAL_LEARNING_RATE: float) -> Action:
    allowed_action_types: list[ActionType] = list({action[0] for action in actions})
    chosen_action_type: ActionType = best_action_type(weights, state, allowed_action_types, INITIAL_LEARNING_RATE)
    return choice([action for action in actions if action[0] == chosen_action_type])
def compute_state_from_tank_state(battle: Battle, team: int, player: int, player_state: TankState) -> State:
    # NOTE: this does not scale past one enemy; only the first possible target is inspected.
    in_light_cover, in_heavy_cover, ready_to_shoot, moving, possible_target, enemy_moving, enemy_light_cover = (
        False, False, False, False, False, False, False)
    enemy_team: int = 1 if team == 0 else 0
    possible_targets = battle.possible_targets(team, player)
    if len(possible_targets) > 0:
        possible_target = True
        # Just grab the first target; this is the part that breaks with more than one enemy.
        target_state: TankState = battle.team_states[enemy_team][possible_targets[0]]
        if target_state.moving():
            enemy_moving = True
        if target_state.in_light_cover():
            enemy_light_cover = True
    if player_state.in_light_cover():
        in_light_cover = True
    elif player_state.in_heavy_cover():
        in_heavy_cover = True
    if player_state.ready_to_shoot():
        ready_to_shoot = True
    if player_state.moving():
        moving = True
    return in_light_cover, in_heavy_cover, ready_to_shoot, moving, possible_target, enemy_moving, enemy_light_cover
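# For example, a loaded, stationary tank in light cover that can see a moving, uncovered enemy
# maps to (True, False, True, False, True, True, False).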
def q_learn_1v1(enemy_strategy: str, num_simulations: int = 1000, epsilon: float = 0.1, discount: float = 0.4, l_rate: float = 0.3, trained_filename: Union[str, None] = None) -> type[Policy]:
    EPSILON = epsilon  # exploration rate
    DISCOUNT_FACTOR = discount
    INITIAL_LEARNING_RATE = l_rate
"""
what should my functional approximators be?
- in light cover
- in heavy cover
- ready to fire
- moving
- enemy moving
- enemy in light cover
actions should probably just be their type:
- move
- move and shoot
- shoot
- do nothing
this gives 64 buckets with the 4 action types each
but not every bucket is necessarily valid, so will be less.
since it's sparse can use a dictionary instead of a many-dimensional array
"""
    if not trained_filename:
        weights: dict[Q, tuple[float, float]] = {}
        # Simulate num_simulations battles against the chosen enemy policy.
        for _ in range(num_simulations):
            b = Battle([Tank()], [Tank()])
            if enemy_strategy == "greedy":
                enemy_policy = GreedyShooterRandomPolicy(b)
            elif enemy_strategy == "random":
                enemy_policy = RandomPolicy(b)
            else:
                raise ValueError("Invalid policy: use 'greedy' or 'random'.")
            while not b.battle_is_over():
                team0_actions, team1_actions = b.generate_all_player_actions()
                player0_actions = team0_actions[0]
                player1_actions = team1_actions[0]
                player_state: TankState = b.team_states[0][0]
                # Choose & apply action
                state = compute_state_from_tank_state(b, 0, 0, player_state)
                action = choose_qaction(weights, state, player0_actions, EPSILON, INITIAL_LEARNING_RATE)
                b.apply_all_player_actions(([action], [enemy_policy.choose_action(1, 0, player1_actions)]))
                # Observe result
                damage_dealt_and_avoided, damage_taken = player_state.last_damage_stats
                reward: int = damage_dealt_and_avoided - damage_taken
                # Update Q-values. player_state was mutated by apply_all_player_actions,
                # so compute_state_from_tank_state now yields the *post-action* state.
                value, learning_rate = get_weight_and_learning_rate(weights, state, action[0], INITIAL_LEARNING_RATE)
                next_state = compute_state_from_tank_state(b, 0, 0, player_state)
                next_action_types = list({a[0] for a in b.generate_all_player_actions()[0][0]})
                next_state_value = best_action_value(weights, next_state, next_action_types, INITIAL_LEARNING_RATE)
                new_weight: float = (1 - learning_rate) * value + learning_rate * (reward + DISCOUNT_FACTOR * next_state_value)
                set_weight(weights, state, action[0], new_weight)
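                # The two lines above are the standard tabular Q-learning update,
                #   Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a')),
                # with alpha decayed per (state, action) pair by set_weight.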
        # Timestamp formatted without colons so the filename is portable across filesystems.
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        filename: str = f"trained_qvalues/{timestamp} e-{EPSILON} d-{DISCOUNT_FACTOR} lr-{INITIAL_LEARNING_RATE}.pickle"
        with open(filename, "wb") as outfile:
            pickle.dump(weights, outfile)
    else:
        # A trained table was supplied; load it instead of training from scratch.
        with open(trained_filename, "rb") as infile:
            weights = pickle.load(infile)
    class QPolicy(Policy):
        def choose_action(self, team: int, player: int, actions: list[Action]) -> Action:
            player_state: TankState = self.battle.team_states[team][player]
            state = compute_state_from_tank_state(self.battle, team, player, player_state)
            return choose_best_qaction(weights, state, actions, INITIAL_LEARNING_RATE)
    # Return the class itself; callers construct it with the Battle they want to play.
    return QPolicy
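
# A minimal usage sketch, not part of the module proper. It assumes the project's APIs exactly as
# exercised above (Battle, Tank, and Policy subclasses constructed with a Battle); the simulation
# count here is arbitrary.
if __name__ == "__main__":
    QPolicyClass = q_learn_1v1("greedy", num_simulations=1000)
    b = Battle([Tank()], [Tank()])
    trained, opponent = QPolicyClass(b), GreedyShooterRandomPolicy(b)
    while not b.battle_is_over():
        team0_actions, team1_actions = b.generate_all_player_actions()
        b.apply_all_player_actions(([trained.choose_action(0, 0, team0_actions[0])],
                                    [opponent.choose_action(1, 0, team1_actions[0])]))
    # The pickled Q-table is also written under trained_qvalues/ for later reuse via trained_filename.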