Tic-Tac-Toe is played on a 3*3 grid: the two players move in turn, and the first to line up three of their own pieces in a row wins.
The reference code is below; I only removed a few unused bits:
#######################################################################
# Copyright (C)                                                       #
# 2016 - 2018 Shangtong Zhang(zhangshangtong.cpp@gmail.com)           #
# 2016 Jan Hakenberg(jan.hakenberg@gmail.com)                         #
# 2016 Tian Jun(tianjun.cpp@gmail.com)                                #
# 2016 Kenta Shimada(hyperkentakun@gmail.com)                         #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################
## https://www.cnblogs.com/pinard/p/9385570.html ##
## Reinforcement Learning (Part 1): Model Basics ##

import numpy as np
import pickle

BOARD_ROWS = 3
BOARD_COLS = 3
BOARD_SIZE = BOARD_ROWS * BOARD_COLS
The State class
Brief description: each state is identified by a custom hash value. The main methods are get_all_states (run once to enumerate every state) and next_state (play one move and return the new state). A small worked example of the hash encoding follows the class.
class State:
    def __init__(self):
        # the board is represented by an n * n array,
        # 1 represents a chessman of the player who moves first,
        # -1 represents a chessman of another player
        # 0 represents an empty position
        self.data = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.winner = None
        self.hash_val = None
        self.end = None

    # compute the hash value for one state, it's unique
    def hash(self):
        if self.hash_val is None:
            self.hash_val = 0
            for i in self.data.reshape(BOARD_ROWS * BOARD_COLS):
                # original values are -1, 0, 1; map -1 to 2 so the board reads as a base-3 number
                if i == -1:
                    i = 2
                self.hash_val = self.hash_val * 3 + i
        return int(self.hash_val)

    # check whether a player has won the game, or it's a tie
    def is_end(self):
        if self.end is not None:
            return self.end
        results = []
        # check rows
        for i in range(0, BOARD_ROWS):
            results.append(np.sum(self.data[i, :]))
        # check columns
        for i in range(0, BOARD_COLS):
            results.append(np.sum(self.data[:, i]))
        # check diagonals
        results.append(0)
        for i in range(0, BOARD_ROWS):
            results[-1] += self.data[i, i]
        results.append(0)
        for i in range(0, BOARD_ROWS):
            results[-1] += self.data[i, BOARD_ROWS - 1 - i]

        for result in results:
            if result == 3:
                self.winner = 1
                self.end = True
                return self.end
            if result == -3:
                self.winner = -1
                self.end = True
                return self.end

        # whether it's a tie
        sum = np.sum(np.abs(self.data))
        if sum == BOARD_ROWS * BOARD_COLS:
            self.winner = 0
            self.end = True
            return self.end

        # game is still going on
        self.end = False
        return self.end

    # @symbol: 1 or -1
    # put chessman symbol in position (i, j)
    def next_state(self, i, j, symbol):
        new_state = State()
        new_state.data = np.copy(self.data)
        new_state.data[i, j] = symbol
        return new_state

    # print the board
    def print(self):
        for i in range(0, BOARD_ROWS):
            print('-------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                if self.data[i, j] == 1:
                    token = '*'
                if self.data[i, j] == 0:
                    token = '0'
                if self.data[i, j] == -1:
                    token = 'x'
                out += token + ' | '
            print(out)
        print('-------------')


def get_all_states_impl(current_state, current_symbol, all_states):
    '''
    all_states: dict keyed by the state hash, value is the tuple (state, is_end)
    '''
    for i in range(0, BOARD_ROWS):
        for j in range(0, BOARD_COLS):
            if current_state.data[i][j] == 0:
                newState = current_state.next_state(i, j, current_symbol)
                newHash = newState.hash()
                if newHash not in all_states.keys():
                    isEnd = newState.is_end()
                    all_states[newHash] = (newState, isEnd)
                    # if the game has not ended, the other player moves next
                    if not isEnd:
                        get_all_states_impl(newState, -current_symbol, all_states)


def get_all_states():
    current_symbol = 1
    current_state = State()
    all_states = dict()
    all_states[current_state.hash()] = (current_state, current_state.is_end())
    get_all_states_impl(current_state, current_symbol, all_states)
    return all_states


# all possible board configurations
all_states = get_all_states()
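To make the base-3 hash concrete, here is a quick check I added myself (it is not part of the original script): place a piece or two and print the resulting hash values.

# a quick sanity check of the base-3 state hash (my own illustration)
s = State()
s.data[1, 1] = 1                 # the first player takes the centre
print(s.hash())                  # flattened board [0,0,0,0,1,0,0,0,0] -> 1 * 3**4 = 81

s2 = s.next_state(0, 0, -1)      # the opponent answers in the top-left corner
print(s2.hash())                 # -1 maps to 2, so [2,0,0,0,1,0,0,0,0] -> 2 * 3**8 + 3**4 = 13203

print(len(all_states))           # a few thousand reachable positions are enumerated in total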
The Judger (referee): makes the two players move in turn. Its main methods are alternate (switch between players) and play (run one game; the important call inside play is each player's act method, described later).
class Judger:
    # @player1: the player who will move first, its chessman will be 1
    # @player2: another player with a chessman -1
    def __init__(self, player1, player2):
        self.p1 = player1
        self.p2 = player2
        self.p1_symbol = 1
        self.p2_symbol = -1
        self.p1.set_symbol(self.p1_symbol)
        self.p2.set_symbol(self.p2_symbol)
        self.current_state = State()

    def reset(self):
        self.p1.reset()
        self.p2.reset()

    def alternate(self):
        while True:
            yield self.p1
            yield self.p2

    # @print_state: if True, print each board during the game
    def play(self, print_state=False):
        alternator = self.alternate()
        self.reset()
        current_state = self.current_state
        self.p1.set_state(current_state)
        self.p2.set_state(current_state)
        while True:
            player = next(alternator)
            if print_state:
                current_state.print()
            [i, j, symbol] = player.act()
            next_state_hash = current_state.next_state(i, j, symbol).hash()
            current_state, is_end = all_states[next_state_hash]
            self.p1.set_state(current_state)
            self.p2.set_state(current_state)
            if is_end:
                if print_state:
                    current_state.print()
                return current_state.winner
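alternate is just an infinite generator, so every next() call inside play flips whose turn it is. A minimal standalone sketch of the same pattern (the names here are mine, not from the code):

def take_turns(a, b):
    # endlessly yield a, b, a, b, ...
    while True:
        yield a
        yield b

turns = take_turns('player 1', 'player 2')
print([next(turns) for _ in range(5)])   # ['player 1', 'player 2', 'player 1', 'player 2', 'player 1']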
The AI player: estimations stores a value for every state and is used to choose the next state; the greedy flags mark exploratory moves, because random moves do not take part in the value update.
Its main methods are set_symbol (set each state's initial value from this player's point of view), backup (update the state values: if the next state is worth more, the current state's value is pulled up towards it, so long-term outcomes feed back into earlier states), and act (pick the coordinates of the next move).
class Player:
    # @step_size: the step size to update estimations
    # @epsilon: the probability to explore
    def __init__(self, step_size=0.1, epsilon=0.1):
        self.estimations = dict()
        self.step_size = step_size
        self.epsilon = epsilon
        self.states = []
        self.greedy = []

    def reset(self):
        self.states = []
        self.greedy = []

    def set_state(self, state):
        self.states.append(state)
        self.greedy.append(True)

    def set_symbol(self, symbol):
        self.symbol = symbol
        # initialize the state values: a final win is worth 1, a loss 0,
        # a tie 0.5, and every non-terminal state starts at 0.5
        for hash_val in all_states.keys():
            (state, is_end) = all_states[hash_val]
            if is_end:
                if state.winner == self.symbol:
                    self.estimations[hash_val] = 1.0
                elif state.winner == 0:
                    # we need to distinguish between a tie and a lose
                    self.estimations[hash_val] = 0.5
                else:
                    self.estimations[hash_val] = 0
            else:
                self.estimations[hash_val] = 0.5

    # update value estimation
    def backup(self):
        # for debug
        # print('player trajectory')
        # for state in self.states:
        #     state.print()
        self.states = [state.hash() for state in self.states]
        # walk backwards through the episode
        for i in reversed(range(len(self.states) - 1)):
            state = self.states[i]
            td_error = self.greedy[i] * (self.estimations[self.states[i + 1]] - self.estimations[state])
            self.estimations[state] += self.step_size * td_error

    # choose an action based on the state
    def act(self):
        # the current (latest) state
        state = self.states[-1]
        # hashes of the possible next states
        next_states = []
        # coordinates of the possible next moves
        next_positions = []
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                if state.data[i, j] == 0:
                    next_positions.append([i, j])
                    next_states.append(state.next_state(i, j, self.symbol).hash())

        # explore randomly with a small probability
        if np.random.rand() < self.epsilon:
            action = next_positions[np.random.randint(len(next_positions))]
            action.append(self.symbol)
            # mark this move as exploratory so it does not take part in the value update
            self.greedy[-1] = False
            return action

        # otherwise act greedily on the highest estimated value
        values = []
        for hash, pos in zip(next_states, next_positions):
            values.append((self.estimations[hash], pos))
        values.sort(key=lambda x: x[0], reverse=True)
        action = values[0][1]
        action.append(self.symbol)
        return action

    def save_policy(self):
        with open('policy_%s.bin' % ('first' if self.symbol == 1 else 'second'), 'wb') as f:
            pickle.dump(self.estimations, f)

    def load_policy(self):
        with open('policy_%s.bin' % ('first' if self.symbol == 1 else 'second'), 'rb') as f:
            self.estimations = pickle.load(f)
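backup is essentially the temporal-difference update V(S_t) <- V(S_t) + step_size * [V(S_{t+1}) - V(S_t)], with the greedy flag zeroing out the update whenever the move into S_{t+1} was exploratory. A small numeric walk-through with made-up hashes and values (not taken from a real game):

# toy episode: three state hashes, the last one a winning terminal state worth 1.0
estimations = {101: 0.5, 202: 0.5, 303: 1.0}
states = [101, 202, 303]
greedy = [True, True, True]
step_size = 0.1

for i in reversed(range(len(states) - 1)):
    state = states[i]
    td_error = greedy[i] * (estimations[states[i + 1]] - estimations[state])
    estimations[state] += step_size * td_error

print(estimations)   # {101: 0.505, 202: 0.55, 303: 1.0} -- the win leaks backwards through the episode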
The human player: the act method lets you play your own move.
# human interface
# input a key to put a chessman
# | q | w | e |
# | a | s | d |
# | z | x | c |
class HumanPlayer:
    def __init__(self, **kwargs):
        self.symbol = None
        self.keys = ['q', 'w', 'e', 'a', 's', 'd', 'z', 'x', 'c']
        self.state = None
        return

    def reset(self):
        return

    def set_state(self, state):
        self.state = state

    def set_symbol(self, symbol):
        self.symbol = symbol
        return

    def backup(self, _):
        return

    def act(self):
        self.state.print()
        key = input("Input your position:")
        data = self.keys.index(key)
        i = data // BOARD_COLS
        j = data % BOARD_COLS
        return (i, j, self.symbol)
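The q/w/e, a/s/d, z/x/c keys are mapped to board coordinates with integer division and modulo: for example 's' sits at index 4 in the key list, so it becomes row 4 // 3 = 1 and column 4 % 3 = 1, the centre square. A tiny check (illustration only, not part of the original script):

keys = ['q', 'w', 'e', 'a', 's', 'd', 'z', 'x', 'c']
for key in ['q', 's', 'c']:
    data = keys.index(key)
    print(key, (data // 3, data % 3))   # q -> (0, 0), s -> (1, 1), c -> (2, 2)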
Training:
def train(epochs):
    player1 = Player(epsilon=0.01)
    player2 = Player(epsilon=0.01)
    judger = Judger(player1, player2)
    player1_win = 0.0
    player2_win = 0.0
    for i in range(1, epochs + 1):
        winner = judger.play(print_state=False)
        if winner == 1:
            player1_win += 1
        if winner == -1:
            player2_win += 1
        # print both players' running win rates; near the end almost every game is a tie
        if i % 100 == 0:
            print('Epoch %d, player 1 win %.02f, player 2 win %.02f' % (i, player1_win / i, player2_win / i))
        player1.backup()
        player2.backup()
    # save the state values; what training really learns is each state's value for each player
    player1.save_policy()
    player2.save_policy()
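save_policy pickles each player's estimations dict into policy_first.bin and policy_second.bin, so after training you can inspect the learned state values directly. The file names below come from save_policy; the inspection code itself is my own addition:

import pickle

with open('policy_first.bin', 'rb') as f:
    estimations = pickle.load(f)

print(len(estimations))   # one learned value per enumerated board state
print(estimations[0])     # the empty board hashes to 0, so this is the opening position's value for player 1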
AI self-play test:
def compete(turns):
    # exploratory (random) moves are disabled here
    player1 = Player(epsilon=0)
    player2 = Player(epsilon=0)
    judger = Judger(player1, player2)
    player1.load_policy()
    player2.load_policy()
    player1_win = 0.0
    player2_win = 0.0
    for i in range(0, turns):
        winner = judger.play()
        if winner == 1:
            player1_win += 1
        if winner == -1:
            player2_win += 1
        # judger.reset()
    print('%d turns, player 1 win %.02f, player 2 win %.02f' % (turns, player1_win / turns, player2_win / turns))
Human vs. AI:
def play():
    while True:
        player1 = HumanPlayer()
        player2 = Player(epsilon=0)
        judger = Judger(player1, player2)
        player2.load_policy()
        winner = judger.play()
        if winner == player2.symbol:
            print("You lose!")
        elif winner == player1.symbol:
            print("You win!")
        else:
            print("It is a tie!")
Let's go!
if __name__ == '__main__':
    train(int(1e4))
    compete(int(1e3))
    play()
After training, the score line reads "Epoch 10000, player 1 win 0.08, player 2 win 0.03".
The win rates are not exactly zero only because a little random exploration (1%) is still happening during training.
In the self-play test, with exploration switched off, the result is "1000 turns, player 1 win 0.00, player 2 win 0.00".
In other words, every game is a draw.
After that comes the human-vs-AI game, and you simply cannot beat this AI.
Original post: https://www.cnblogs.com/lunge-blog/p/11688543.html