import random
from collections import deque
from copy import deepcopy
from typing import Any

import numpy as np
import tensorflow as tf
from keras.engine.input_layer import InputLayer
from keras.layers import BatchNormalization
from tensorflow.python.keras import Sequential, regularizers, Input
from tensorflow.python.keras.layers import Dense, Lambda, Dropout
from tensorflow.python.keras.optimizer_v2.adam import Adam

from minimax.minimaxAlgo import MiniMax
from utilities import Board
from utilities.constants import WHITE, GREEN
from utilities.gameManager import GameManager


class ReinforcementLearning():

    def __init__(self, actionSpace: list, board: Board, colour: int, gameManager: GameManager) -> None:
        """
        Constructor for the ReinforcementLearning class
        :param actionSpace: List of the currently available (encoded) moves
        :param board: The game board
        :param colour: The colour this agent plays as
        :param gameManager: The game manager driving the match
        """
        self.gameManager = gameManager
        self.actionSpace = actionSpace
        self.board = board
        self.state = self.board.board
        self.colour = colour
        self.score = 0
        # Q-learning hyperparameters
        self.epsilon = 1
        self.gamma = .95
        self.batchSize = 512
        self.maxSize = 32
        self.epsilonMin = .01
        self.epsilonDecay = .995
        self.learningRate = 0.0001
        self.memory = deque(maxlen=10000000)
        self.model = self.buildMainModel()
        print(self.model.summary())

    def AITrain(self, board: Board) -> tuple:
        """
        Plays one training move of the draughts game and updates the model
        :param board: The current board
        :return: The running score and the new board state (None if no move was possible)
        """
        self.board = board
        self.state = self._convertState(self.board.board)
        self.actionSpace = self.encodeMoves(self.colour, self.board)
        if len(self.actionSpace) == 0:
            return self.score, None

        action = self._act()
        reward, nextState, done = self.board.step(action, self.colour)
        self.score += reward
        self.state = self._convertState(nextState.board)
        self._remember(deepcopy(self.board), action, reward, self.state, done)
        self._replay()
        return self.score, nextState

    def AITest(self, board: Board) -> Board:
        """
        Runs the trained AI for a single move
        :param board: The board
        :return: The new board
        """
        actionSpace = self.encodeMoves(WHITE, board)
        if len(actionSpace) == 0:
            print("Cannot make move")
            return None

        totalMoves = len(actionSpace)
        # moves = np.squeeze(moves)
        # Pad the move list to the fixed input width expected by the network
        moves = np.pad(actionSpace, (0, self.maxSize - totalMoves), 'constant', constant_values=(1, 1))
        actValues = self.model.predict(self.normalise(moves))
        val = np.argmax(actValues[0])
        # Clamp the chosen index to the number of legal moves
        val = val if val < totalMoves else totalMoves - 1
        reward, newBoard, done = board.step(actionSpace[val], WHITE)
        return newBoard

    def buildMainModel(self) -> Sequential:
        """
        Build the Q-network for the AI
        :return: The compiled model
        """
        # Board model: flatten to the 32 playable squares, then a stack of
        # regularised dense layers down to 16 linear outputs
        modelLayers = [
            Lambda(lambda x: tf.reshape(x, [-1, 32])),
            Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
            Dropout(0.2),
            Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
            Dropout(0.2),
            Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
            Dropout(0.2),
            Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
            Dropout(0.2),
            Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
            Dropout(0.2),
            Dense(16, activation='linear', kernel_regularizer=regularizers.l2(0.01))
        ]
        boardModel = Sequential(modelLayers)
        # boardModel.add(BatchNormalization())
        boardModel.compile(optimizer=Adam(learning_rate=self.learningRate), loss='mean_squared_error')
        boardModel.build(input_shape=(None, None))
        return boardModel

    def _replay(self) -> None:
        """
        Trains the model on a minibatch sampled from the replay memory
        :return: None
        """
        if len(self.memory) < self.batchSize:
            # Not enough data to replay and train the model
            return

        # Get a random sample from the memory
        minibatch = random.sample(self.memory, int(self.maxSize))
        # Extract the states, rewards and done flags from the sampled transitions
        states = [m[0] for m in minibatch]
        rewards = [m[2] for m in minibatch]
        dones = [m[4] for m in minibatch]

        # Encode the legal moves for each sampled state
        encodedMoves = []
        for state in states:
            encodedMoves.append(self.encodeMoves(self.colour, state))

        # Calculate the Q-learning targets: the reward, plus the discounted
        # best Q value of the next state for non-terminal transitions
        targets = []
        for i, moves in enumerate(encodedMoves):
            if dones[i]:
                target = rewards[i]
            else:
                target = rewards[i] + self.gamma * self._maxNextQ()
            targets.append(target)

        encodedMoves = np.array([np.pad(m, (0, self.maxSize - len(m)), 'constant', constant_values=(1, 1))
                                 for m in encodedMoves])
        targets = np.array(targets)
        self.model.fit(self.normalise(encodedMoves), self.normalise(targets), epochs=20)

        # Decay the exploration rate after each replay
        if self.epsilon > self.epsilonMin:
            self.epsilon *= self.epsilonDecay

    def _remember(self, state: np.array, action: int, reward: float, nextState: np.array, done: bool) -> None:
        """
        Stores a transition in the replay memory
        :param state: The current state
        :param action: The action taken
        :param reward: The reward for the action
        :param nextState: The next state
        :param done: Whether the game is finished
        :return: None
        """
        self.memory.append((state, action, reward, nextState, done))

    def _act(self) -> Any:
        """
        Chooses an action based on the available moves
        :return: The action
        """
        if np.random.rand() <= self.epsilon:
            # Explore: ask the minimax player for a move and encode it,
            # falling back to a random action if minimax cannot move
            mm = MiniMax()
            value, newBoard = mm.AI(3, self.colour, self.gameManager)
            if newBoard is None:
                return random.choice(self.actionSpace)
            where = self._boardDiff(self.board, newBoard)
            return self._encode(where[0] + 1, where[1] + 1)

        if len(self.actionSpace) == 1:
            return self.actionSpace[0]

        # Exploit: pick the move with the highest predicted Q value
        encodedMoves = np.squeeze(self.actionSpace)
        encodedMoves = np.pad(encodedMoves, (0, self.maxSize - len(encodedMoves)), 'constant', constant_values=(1, 1))
        actValues = self.model.predict(self.normalise(encodedMoves))
        val = np.argmax(actValues[0])
        val = val if val < len(self.actionSpace) else len(self.actionSpace) - 1
        return self.actionSpace[val]

    def resetScore(self) -> None:
        """
        Resets the score
        :return: None
        """
        self.score = 0

    def _convertState(self, board: list) -> list:
        """
        Converts the board into a 2D list of numbers
        (0 = empty square, 1 = colour 1, 2 = the other colour)
        :param board: 2D list of pieces
        :return: new 2D list of numbers
        """
        num_board = []
        for row in board:
            num_row = []
            for piece in row:
                if piece == 0:
                    num_row.append(0)
                    continue
                if piece.colour == 1:
                    num_row.append(1)
                    continue
                num_row.append(2)
            num_board.append(num_row)
        return num_board

    def _encode(self, start: tuple, end: tuple) -> int:
        """
        Encodes a move into a single integer by concatenating the digits of
        the changed squares, e.g. rows (3, 4) and columns (2, 3) become 3243
        :param start: Row indices of the squares changed by the move
        :param end: Column indices of the squares changed by the move
        :return: Encoded move
        """
        start_row = start[0]
        start_col = end[0]
        end_row = start[-1]
        end_col = end[-1]
        # Concatenate into a single integer
        return int(str(start_row) + str(start_col) + str(end_row) + str(end_col))

    def _maxNextQ(self) -> float:
        """
        Calculates the max Q value for the next state
        :return: the max Q value
        """
        colour = WHITE if self.colour == GREEN else GREEN
        encodedMoves = self.encodeMoves(colour, self.board)
        if len(encodedMoves) == 0:
            return -1
        paddedMoves = np.array(np.pad(encodedMoves, (0, self.maxSize - len(encodedMoves)), 'constant', constant_values=(1, 1)))
        nextQValues = self.model.predict_on_batch(self.normalise(paddedMoves))
        return np.max(nextQValues)

    def encodeMoves(self, colour: int, board: Board) -> list:
        """
        Encodes all legal moves for a colour into a list of encoded moves
        :param colour: Colour of the player
        :param board: The board
        :return: list of encoded moves
        """
        encodedMoves = []
        moves = board.getAllMoves(colour)
        for move in moves:
            where = self._boardDiff(board, move)
            encodedMoves.append(self._encode(where[0] + 1, where[1] + 1))
        return encodedMoves

    def _boardDiff(self, board: Board, move: Board) -> np.array:
        """
        Finds the squares that differ between the two boards
        :param board: The current board
        :param move: The new board
        :return: The row and column indices of the changed squares
        """
        cnvState = np.array(self._convertState(board.board))
        cnvMove = np.array(self._convertState(move.board))
        diff = np.subtract(cnvMove, cnvState)
        diff = np.nonzero(diff)
        return diff

    def normalise(self, data: np.array) -> np.array:
        """
        Normalises the data
        :param data: the data to normalise
        :return: normalised data
        """
        # Encoded moves are four-digit integers, so dividing by 10000
        # scales every input into the [0, 1) range
        return data / 10000
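

# Example usage (illustrative sketch only): the commented loop below shows one
# way the agent above might be driven during training. `GameManager`, `Board`
# and the colour constants come from this project's own packages; the
# zero-argument constructors and the episode count are assumptions made purely
# for illustration and may not match the real signatures.
#
# if __name__ == "__main__":
#     gameManager = GameManager()
#     board = Board()
#     agent = ReinforcementLearning([], board, GREEN, gameManager)
#     for episode in range(100):
#         agent.resetScore()
#         while board is not None:
#             score, board = agent.AITrain(board)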