-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgoSim.py
382 lines (310 loc) · 14.9 KB
/
goSim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
from gym import error
try:
import pachi_py
except ImportError as e:
# The dependency group [pachi] should match the name in setup.py.
raise error.DependencyNotInstalled('{}. (HINT: you may need to install the Go dependencies via "pip install gym[pachi]".)'.format(e))
import numpy as np
import gym
from gym import spaces
from gym.utils import seeding
from six import StringIO
import sys
import six
from copy import copy
# The coordinate representation of Pachi (and pachi_py) is defined on a board
# with extra rows and columns on the margin of the board, so positions on the board
# are not numbers in [0, board_size**2) as one would expect. For this Go env, we instead
# use an action representation that does fall in this more natural range.
def _pass_action(board_size):
return board_size**2
def _resign_action(board_size):
return board_size**2 + 1
def _coord_to_action(board, c):
    '''
    Translates a Pachi coordinate into this module's integer action.

    Args:
        board: board providing `size` and `coord_to_ij`
        c: Pachi coordinate (may be the special pass or resign coordinate)
    Returns:
        the pass action, the resign action, or row * size + column for a board point
    '''
    size = board.size
    if c == pachi_py.PASS_COORD:
        return _pass_action(size)
    elif c == pachi_py.RESIGN_COORD:
        return _resign_action(size)
    row, col = board.coord_to_ij(c)
    return row * size + col
def _action_to_coord(board, a):
    '''
    Translates an integer action into a Pachi coordinate.

    Args:
        board: board providing `size` and `ij_to_coord`
        a: integer action; the pass and resign actions map to Pachi's special coordinates
    Returns:
        the Pachi coordinate for the action
    '''
    size = board.size
    if a == _pass_action(size):
        return pachi_py.PASS_COORD
    elif a == _resign_action(size):
        return pachi_py.RESIGN_COORD
    # Board points are numbered row by row, so quotient/remainder give (row, column).
    row, col = divmod(a, size)
    return board.ij_to_coord(row, col)
def str_to_action(board, s):
    '''
    Converts a textual coordinate into an integer action.

    The string is encoded to bytes before being handed to the board's
    `str_to_coord`, and the resulting Pachi coordinate is mapped to an action.
    '''
    encoded = s.encode()
    coord = board.str_to_coord(encoded)
    return _coord_to_action(board, coord)
class GoState(object):
    '''
    Snapshot of a Go game: the board together with the color whose turn it is.
    Moves are addressed by integer actions in [0, num_actions) rather than
    by Pachi's internal "coord_t" values.
    '''

    def __init__(self, board, color):
        '''
        Args:
            board: current board
            color: color of the player to move (pachi_py.BLACK or pachi_py.WHITE)
        '''
        assert color in (pachi_py.BLACK, pachi_py.WHITE), 'Invalid player color'
        self.board = board
        self.color = color

    def act(self, action):
        '''
        Plays an integer action for the player to move.
        Returns:
            a new GoState holding the board returned by `board.play` and the other color to move
        '''
        coord = _action_to_coord(self.board, action)
        next_board = self.board.play(coord, self.color)
        next_color = pachi_py.stone_other(self.color)
        return GoState(next_board, next_color)

    def __repr__(self):
        # The board's __repr__ is called directly and decoded, so it is expected to return bytes.
        player = six.u(pachi_py.color_to_str(self.color))
        board_text = self.board.__repr__().decode()
        return 'To play: {}\n{}'.format(player, board_text)
### Adversary policies ###
def make_random_policy(np_random):
    '''
    Builds a policy that picks uniformly among the legal coordinates.

    Args:
        np_random: random generator exposing `choice`
    Returns:
        a function (curr_state, prev_state, prev_action) -> integer action;
        only curr_state is used
    '''
    def random_policy(curr_state, prev_state, prev_action):
        board = curr_state.board
        candidates = board.get_legal_coords(curr_state.color)
        chosen = np_random.choice(candidates)
        return _coord_to_action(board, chosen)

    return random_policy
def make_pachi_policy(board, engine_type='uct', threads=1, pachi_timestr=''):
    '''
    Builds a policy backed by a Pachi engine attached to `board`.

    Args:
        board: board the engine is created for
        engine_type: engine name passed to pachi_py.PyPachiEngine
        threads: number used to build the engine's 'threads=N' argument
        pachi_timestr: time setting passed to the engine's `genmove`
    Returns:
        a function (curr_state, prev_state, prev_action) -> integer action
    '''
    engine_args = six.b('threads=%d' % threads)
    pachi_engine = pachi_py.PyPachiEngine(board, engine_type, engine_args)

    def pachi_policy(curr_state, prev_state, prev_action):
        if prev_state is not None:
            # Bring the engine up to date with the move played since its last turn.
            assert pachi_engine.curr_board == prev_state.board, 'Engine internal board is inconsistent with provided board. The Pachi engine must be called consistently as the game progresses.'
            last_coord = _action_to_coord(prev_state.board, prev_action)
            pachi_engine.notify(last_coord, prev_state.color)
            pachi_engine.curr_board.play_inplace(last_coord, prev_state.color)
        # Ask the engine for its move and mirror it on the engine's own board.
        move_coord = pachi_engine.genmove(curr_state.color, pachi_timestr)
        move_action = _coord_to_action(curr_state.board, move_coord)
        pachi_engine.curr_board.play_inplace(move_coord, curr_state.color)
        return move_action

    return pachi_policy
def _play(black_policy_fn, white_policy_fn, board_size=19):
    '''
    Samples a trajectory for two player policies, starting with black on an empty board.

    Args:
        black_policy_fn, white_policy_fn: functions called as
            policy(curr_state, prev_state, prev_action) that return an integer action
        board_size: side length of the board to create
    Returns:
        list of (state, action, next_state) tuples, one per move, in play order
    '''
    moves = []
    prev_state, prev_action = None, None
    # The color constants live in pachi_py; this module defines no bare BLACK name.
    curr_state = GoState(pachi_py.CreateBoard(board_size), pachi_py.BLACK)
    while not curr_state.board.is_terminal:
        policy_fn = black_policy_fn if curr_state.color == pachi_py.BLACK else white_policy_fn
        a = policy_fn(curr_state, prev_state, prev_action)
        next_state = curr_state.act(a)
        moves.append((curr_state, a, next_state))
        prev_state, prev_action = curr_state, a
        curr_state = next_state
    return moves
class GoEnv(gym.Env):
    '''
    Go environment in which each step() call plays one move for the color
    currently stored in `player_color`.

    step() and decide_winner() return a 6-tuple
    (observation, action, reward, done, info, score), where `action` is the
    move that was actually recorded and `score` is the board's official score
    plus komi.

    NOTE(review): the automatic opponent reply inside step() is disabled in this
    version; callers presumably alternate colors with set_player_color() - confirm.
    '''
    metadata = {"render.modes": ["human", "ansi"]}

    def __init__(self, player_color, observation_type, illegal_move_mode, board_size, komi):
        """
        Args:
            player_color: Stone color for the agent. Either 'black' or 'white'
            observation_type: State encoding. Only 'image3c' is supported
            illegal_move_mode: What to do when the agent makes an illegal move.
                'lose' ends the game with reward -1. 'raise' does not raise:
                the illegal move is replaced by a pass
            board_size: Side length of the board (int >= 1)
            komi: Value added to the board's official score when scoring
        """
        assert isinstance(board_size, int) and board_size >= 1, 'Invalid board size: {}'.format(board_size)
        self.board_size = board_size
        self.komi = komi
        self._seed()

        colormap = {
            'black': pachi_py.BLACK,
            'white': pachi_py.WHITE,
        }
        try:
            self.player_color = colormap[player_color]
        except KeyError:
            raise error.Error("player_color must be 'black' or 'white', not {}".format(player_color))

        # No opponent is configured by the constructor. The attributes are still
        # created so that reset(), close() and the opponent helpers can read them.
        self.opponent = None
        self.opponent_policy = None

        assert observation_type in ['image3c']
        self.observation_type = observation_type

        assert illegal_move_mode in ['lose', 'raise']
        self.illegal_move_mode = illegal_move_mode

        # Kept in addition to the assert above, which is stripped under `python -O`.
        if self.observation_type != 'image3c':
            raise error.Error('Unsupported observation type: {}'.format(self.observation_type))

        shape = pachi_py.CreateBoard(self.board_size).encode().shape
        self.observation_space = spaces.Box(np.zeros(shape), np.ones(shape))
        # One action for each board position, pass, and resign
        self.action_space = spaces.Discrete(self.board_size**2 + 2)

        # Filled in by reset()
        self.state = None
        self.done = True

        # Tracks whether the most recent move was a pass (two in a row end the game)
        self.last_player_passed = False
        # Per-point channel patterns compared against obs[:, x, y] in is_legal_action_old
        self.BLACK = np.array([1, 0, 0])
        self.WHITE = np.array([0, 1, 0])
        self.EMPTY = np.array([0, 0, 1])

    def _seed(self, seed=None):
        '''
        Seeds the numpy generator and Pachi's random generator.
        Returns:
            [seed1, seed2]: the numpy seed and the derived 32-bit Pachi seed
        '''
        self.np_random, seed1 = seeding.np_random(seed)
        # Derive a random seed.
        seed2 = seeding.hash_seed(seed1 + 1) % 2**32
        pachi_py.pachi_srand(seed2)
        return [seed1, seed2]

    def reset(self):
        '''
        Starts a new game on an empty board with black to move.
        Returns:
            the encoded board
        '''
        self.state = GoState(pachi_py.CreateBoard(self.board_size), pachi_py.BLACK)
        # Do not let a pass recorded in the previous game influence the first move.
        self.last_player_passed = False

        # Let the opponent play if it's not the agent's turn
        if self.state.color != self.player_color:
            self.state, _, _ = self._exec_opponent_play(self.state, None, None)

        # We should be back to the agent color
        assert self.state.color == self.player_color
        self.last_player_passed = False
        self.done = self.state.board.is_terminal
        return self.state.board.encode()

    def close(self):
        '''Drops the opponent policy and the game state.'''
        self.opponent_policy = None
        self.state = None
        self.last_player_passed = False

    def render(self, mode="human", close=False):
        '''
        Writes the textual state to stdout, or to a StringIO when mode is 'ansi'.
        Returns:
            the stream that was written to (None when `close` is true)
        '''
        if close:
            return
        outfile = StringIO() if mode == 'ansi' else sys.stdout
        outfile.write(repr(self.state) + '\n')
        return outfile

    def _current_score(self):
        '''Returns the board's official score plus komi.'''
        return self.state.board.official_score + self.komi

    def _reward_for_score(self, score):
        '''
        Maps a final score to the agent's reward: a positive score is a white
        win and a negative score a black win.
        Returns:
            1. if the agent's color won, -1. if the other color won, 0. on a zero score
        '''
        white_wins = score > 0
        black_wins = score < 0
        player_wins = (white_wins and self.player_color == pachi_py.WHITE) or (black_wins and self.player_color == pachi_py.BLACK)
        return 1. if player_wins else -1. if (white_wins or black_wins) else 0.

    def decide_winner(self):
        '''
        Scores the current position and reports the game as finished.
        Returns:
            (obs, pass_action, reward, True, info, score)
        '''
        current_score = self._current_score()
        reward = self._reward_for_score(current_score)
        return self.state.board.encode(), _pass_action(self.board_size), reward, True, {'state': self.state}, current_score

    def step(self, action):
        '''
        Plays `action` for the current player color.

        Args:
            action: integer action (board point, pass, or resign)
        Returns:
            (obs, action, reward, done, info, score); `action` is the pass
            action whenever a pass was substituted for an illegal move
        '''
        assert self.state.color == self.player_color
        pass_action = _pass_action(self.board_size)
        current_score = self._current_score()

        # Two consecutive passes end the game
        if self.last_player_passed and action == pass_action:
            self.done = True
            return self.decide_winner()

        # If already terminal, then don't do anything
        if self.done:
            return self.state.board.encode(), action, 0., True, {'state': self.state}, current_score

        # If resigned, then we're done
        if action == _resign_action(self.board_size):
            self.done = True
            return self.state.board.encode(), action, -1., True, {'state': self.state}, current_score

        # The move that ends up on the board; differs from `action` only when an
        # illegal move is replaced by a pass.
        played_action = action
        try:
            self.state = self.state.act(action)
        except pachi_py.IllegalMove:
            if self.illegal_move_mode == 'raise':
                # The illegal move counts as a pass, so it can complete a double pass
                if self.last_player_passed:
                    self.done = True
                    return self.decide_winner()
                self.last_player_passed = True
                self.state = self.state.act(pass_action)
                played_action = pass_action
                print("illegal_move, considered as Pass")
            elif self.illegal_move_mode == 'lose':
                # Automatic loss on illegal move
                self.done = True
                return self.state.board.encode(), pass_action, -1., True, {'state': self.state}, current_score
            else:
                raise error.Error('Unsupported illegal move action: {}'.format(self.illegal_move_mode))
        else:
            self.last_player_passed = action == pass_action
        current_score = self._current_score()

        # NOTE(review): no opponent move is made here; see the class docstring.

        # Reward: if nonterminal, then the reward is 0
        if not self.state.board.is_terminal:
            self.done = False
            return self.state.board.encode(), played_action, 0., False, {'state': self.state}, current_score

        # We're in a terminal state. Reward is 1 if won, -1 if lost
        self.done = True
        reward = self._reward_for_score(current_score)
        return self.state.board.encode(), played_action, reward, True, {'state': self.state}, current_score

    def is_legal_action_old(self, obs, action, cur_player_color):
        '''
        Heuristic legality check computed from an observation only.

        A move is accepted when the target point is empty and at least one
        on-board neighbour is not an opponent stone. Pass and resign are always
        accepted. Captures and ko are not considered.

        Args:
            obs: observation indexed as obs[:, x, y]
            action: integer action to check
            cur_player_color: color to move; 2 selects black as the opponent
                (presumably pachi_py.WHITE == 2 - confirm)
        '''
        # If pass or resign
        if action == _pass_action(self.board_size) or action == _resign_action(self.board_size):
            return True

        # Pass and resign were handled above, so only board points remain valid
        if action < 0 or action >= self.board_size**2:
            return False

        a_x, a_y = divmod(action, self.board_size)

        # If position is occupied
        if not np.all(obs[:, a_x, a_y] == self.EMPTY):
            return False

        opp_player = self.BLACK if cur_player_color == 2 else self.WHITE
        last = self.board_size - 1
        neighbours = []
        if a_x > 0:
            neighbours.append((a_x - 1, a_y))
        if a_x != last:
            neighbours.append((a_x + 1, a_y))
        if a_y > 0:
            neighbours.append((a_x, a_y - 1))
        if a_y != last:
            neighbours.append((a_x, a_y + 1))

        # Illegal only if all surrounding places are of different color
        return any(not np.all(obs[:, n_x, n_y] == opp_player) for n_x, n_y in neighbours)

    def is_legal_action(self, action):
        '''
        Asks Pachi whether `action` is legal for the player to move in the
        current state. The current state itself is left untouched.
        '''
        # GoState.act returns a new state instead of modifying this one, so the
        # move can be tried directly on self.state.
        try:
            self.state.act(action)
        except pachi_py.IllegalMove:
            return False
        return True

    def _exec_opponent_play(self, curr_state, prev_state, prev_action):
        '''
        Lets the opponent policy move in `curr_state`.

        Returns:
            (next_state, opponent_resigned, both_passed)
        Raises:
            error.Error: if no opponent policy is configured
        '''
        assert curr_state.color != self.player_color
        if self.opponent_policy is None:
            raise error.Error('No opponent policy is configured; cannot play the opponent move')
        opponent_action = self.opponent_policy(curr_state, prev_state, prev_action)
        pass_action = _pass_action(self.board_size)

        if self.last_player_passed and opponent_action == pass_action:
            return curr_state.act(opponent_action), False, True

        self.last_player_passed = opponent_action == pass_action
        # Todo Add condition for illegal action in case of 2 player game
        opponent_resigned = opponent_action == _resign_action(self.board_size)
        return curr_state.act(opponent_action), opponent_resigned, False

    def set_player_color(self, player_color):
        '''Sets the color step() plays for; the value is stored as given, without validation.'''
        self.player_color = player_color

    @property
    def _state(self):
        return self.state

    def _reset_opponent(self, board):
        '''
        Rebuilds `opponent_policy` from the `opponent` setting.
        Raises:
            error.Error: if `opponent` is not a recognized policy name
        '''
        if self.opponent == 'random':
            self.opponent_policy = make_random_policy(self.np_random)
        elif self.opponent == 'pachi:uct:_2400':
            self.opponent_policy = make_pachi_policy(board=board, engine_type=six.b('uct'), pachi_timestr=six.b('_2400'))  # TODO: strength as argument
        else:
            raise error.Error('Unrecognized opponent policy {}'.format(self.opponent))