import random
from collections import deque
from copy import deepcopy
from typing import Any

import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential, regularizers
from tensorflow.keras.layers import Dense, Dropout, Lambda
from tensorflow.keras.optimizers import Adam

from minimax.minimaxAlgo import MiniMax
from utilities import Board
from utilities.constants import WHITE, GREEN
from utilities.gameManager import GameManager


class ReinforcementLearning:

    def __init__(self, actionSpace: list, board: Board, colour: int, gameManager: GameManager) -> None:
        """
        Constructor for the ReinforcementLearning class
        :param actionSpace: the list of currently available (encoded) moves
        :param board: the game board
        :param colour: the colour this agent plays
        :param gameManager: the game manager driving the match
        """
        self.gameManager = gameManager
        self.actionSpace = actionSpace
        self.board = board
        self.state = self.board.board
        self.colour = colour
        self.score = 0
        self.epsilon = 1
        self.gamma = .95
        self.batchSize = 256
        self.maxSize = 32
        self.epsilonMin = .01
        self.epsilonDecay = .995
        self.learningRate = 0.001
        self.memory = deque(maxlen=10000000)
        self.model = self._buildMainModel()

    def AI(self, board: Board) -> tuple:
        """
        Plays (and learns from) one move of the draughts game
        :param board: the current game board
        :return: the running score and the resulting board, or None if no move is available
        """
        self.board = board
        self.state = self._convertState(self.board.board)
        self.actionSpace = self._encodeMoves(self.colour, self.board)
        if len(self.actionSpace) == 0:
            return self.score, None

        action = self._act()
        reward, nextState, done = self.board.step(action, self.colour)
        self.score += reward
        self.state = self._convertState(nextState.board)
        self._remember(deepcopy(self.board), action, reward, self.state, done)
        self._replay()
        return self.score, nextState

    def _buildMainModel(self) -> Sequential:
        """
        Build the value model for the AI
        :return: the compiled model
        """
        # Board model: reshape the input to 32 features, then a stack of
        # regularised dense layers down to a single linear value output
        modelLayers = [
            Lambda(lambda x: tf.reshape(x, [-1, 32])),
            Dense(256, activation='relu'),
            Dropout(0.2),
            Dense(128, activation='relu'),
            Dropout(0.2),
            Dense(64, activation='relu'),
            Dropout(0.2),
            Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
            Dropout(0.2),
            Dense(16, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
            Dropout(0.2),
            Dense(1, activation='linear', kernel_regularizer=regularizers.l2(0.01))
        ]
        boardModel = Sequential(modelLayers)
        boardModel.compile(optimizer=Adam(learning_rate=0.0001), loss='mean_squared_error')
        boardModel.build(input_shape=(None, None))
        boardModel.summary()
        return boardModel

    def _replay(self) -> None:
        """
        Trains the model on a minibatch sampled from the replay memory
        :return: None (void)
        """
        if len(self.memory) < self.batchSize:
            # Not enough data to replay and train the model
            return

        # Get a random sample of maxSize transitions from the memory
        minibatch = random.sample(self.memory, int(self.maxSize))

        # Extract states, rewards and done flags
        states = [m[0] for m in minibatch]
        rewards = [m[2] for m in minibatch]
        dones = [m[4] for m in minibatch]

        # Encode the moves available in each sampled state
        encodedMoves = []
        for state in states:
            encodedMoves.append(self._encodeMoves(self.colour, state))

        # Calculate the targets: the reward for terminal states, otherwise
        # reward + gamma * the estimated maximum Q value of the opponent's reply
        targets = []
        for i, moves in enumerate(encodedMoves):
            if dones[i]:
                target = rewards[i]
            else:
                target = rewards[i] + self.gamma * self._maxNextQ()

            targets.append(target)

        # Pad every move list to a fixed length so the batch is rectangular
        encodedMoves = np.array([np.pad(m, (0, self.maxSize - len(m)), 'constant', constant_values=(1, 1))
                                 for m in encodedMoves])
        targets = np.array(targets)
        self.model.fit(self._normalise(encodedMoves), self._normalise(targets), epochs=20)

        # Decay the exploration rate after each training step
        if self.epsilon > self.epsilonMin:
            self.epsilon *= self.epsilonDecay
    def _remember(self, state: np.array, action: int, reward: float, nextState: np.array, done: bool) -> None:
        """
        Stores what it has learnt in the replay memory
        :param state: the current state
        :param action: the action taken
        :param reward: the reward for the action
        :param nextState: the next state
        :param done: whether the game is finished
        :return: None (void)
        """
        self.memory.append((state, action, reward, nextState, done))

    def _act(self) -> Any:
        """
        Chooses an action based on the available moves
        :return: the action
        """
        if np.random.rand() <= self.epsilon:
            # Explore: let the minimax player pick a move, falling back to a
            # random action from the action space list if it has none
            mm = MiniMax()
            value, newBoard = mm.AI(3, self.colour, self.gameManager)
            if newBoard is None:
                return random.choice(self.actionSpace)
            where = self._boardDiff(self.board, newBoard)
            return self._encode(where[0] + 1, where[1] + 1)

        if len(self.actionSpace) == 1:
            return self.actionSpace[0]

        # Exploit: score the padded move vector with the model and pick the
        # action at the highest-scoring index of the first output row
        encodedMoves = np.squeeze(self.actionSpace)
        encodedMoves = np.pad(encodedMoves, (0, self.maxSize - len(encodedMoves)), 'constant', constant_values=(1, 1))
        act_values = self.model.predict(self._normalise(encodedMoves).reshape(1, -1))
        return self.actionSpace[np.argmax(act_values[0])]

    def resetScore(self) -> None:
        """
        Resets the running score
        :return: None (void)
        """
        self.score = 0

    def _convertState(self, board: list) -> list:
        """
        Converts the board into a 2D list of numbers
        (0 = empty square, 1 = colour 1, 2 = the other colour)
        :param board: 2D list of pieces
        :return: new 2D list of numbers
        """
        num_board = []

        for row in board:
            num_row = []
            for piece in row:
                if piece == 0:
                    num_row.append(0)
                    continue

                if piece.colour == 1:
                    num_row.append(1)
                    continue

                num_row.append(2)

            num_board.append(num_row)

        return num_board

    def _encode(self, rows: tuple, cols: tuple) -> int:
        """
        Encodes a move into an integer from the row and column indices of the
        squares that changed (as returned by _boardDiff)
        :param rows: row indices of the changed squares
        :param cols: column indices of the changed squares
        :return: encoded move
        """
        start_row = rows[0]
        start_col = cols[0]
        end_row = rows[-1]
        end_col = cols[-1]

        # Concatenate the four digits into a single integer
        return int(str(start_row) + str(start_col) + str(end_row) + str(end_col))

    def _maxNextQ(self) -> float:
        """
        Estimates the maximum Q value over the opponent's available moves
        :return: the maximum predicted Q value, or -1 if the opponent cannot move
        """
        colour = WHITE if self.colour == GREEN else GREEN
        encodedMoves = self._encodeMoves(colour, self.board)
        if len(encodedMoves) == 0:
            return -1

        paddedMoves = np.array(np.pad(encodedMoves, (0, self.maxSize - len(encodedMoves)), 'constant', constant_values=(1, 1)))
        nextQValues = self.model.predict_on_batch(self._normalise(paddedMoves).reshape(1, -1))
        return np.max(nextQValues)

    def _encodeMoves(self, colour: int, board: Board) -> list:
        """
        Encodes all available moves into a list of encoded moves
        :param colour: colour of the player
        :param board: the board
        :return: list of encoded moves
        """
        encodedMoves = []
        moves = board.getAllMoves(colour)
        for move in moves:
            where = self._boardDiff(board, move)
            encodedMoves.append(self._encode(where[0] + 1, where[1] + 1))

        return encodedMoves

    def _boardDiff(self, board: Board, move: Board) -> tuple:
        """
        Finds the squares that differ between the current board and a move
        :param board: the current board
        :param move: the board after the move
        :return: tuple of arrays with the row and column indices that changed
        """
        cnvState = np.array(self._convertState(board.board))
        cnvMove = np.array(self._convertState(move.board))
        diff = np.subtract(cnvMove, cnvState)
        return np.nonzero(diff)

    def _normalise(self, data):
        """
        Normalises each entry of the data by its L2 norm
        :param data: the data to normalise
        :return: the normalised data
        """
        data = np.asarray(data, dtype=float)
        for i in range(len(data)):
            norm = np.linalg.norm(data[i])
            if norm != 0:
                data[i] = data[i] / norm

        return data
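
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only). A minimal example of how the agent might be
# driven turn by turn. It assumes a GameManager that can be constructed with no
# arguments and exposes a `board` attribute; both of those are assumptions, as
# the project's real entry point is not shown in this file. Only the
# ReinforcementLearning class, resetScore and the GREEN constant come from this
# module, and a real match would also alternate with the opponent's moves
# between calls.
if __name__ == "__main__":
    gameManager = GameManager()          # assumed no-argument constructor
    board = gameManager.board            # assumed attribute holding the Board
    agent = ReinforcementLearning([], board, GREEN, gameManager)

    score = 0
    for ply in range(200):               # cap the number of plies for the demo
        score, board = agent.AI(board)   # pick a move and learn from the outcome
        if board is None:                # no legal moves left for the agent
            break

    print("final score:", score)
    agent.resetScore()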