# masters-dissertation/reinforcementLearning/ReinforcementLearning.py
import random
from collections import deque
from copy import deepcopy
from typing import Any

import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential, regularizers
from tensorflow.keras.layers import BatchNormalization, Dense, Dropout, Lambda
from tensorflow.keras.optimizers import Adam

from minimax.minimaxAlgo import MiniMax
from utilities import Board
from utilities.constants import WHITE, GREEN
from utilities.gameManager import GameManager

class ReinforcementLearning:
    def __init__(self, actionSpace: list, board: Board, colour: int, gameManager: GameManager) -> None:
        """
        Constructor for the ReinforcementLearning class
        :param actionSpace: the list of available (encoded) actions
        :param board: the game board
        :param colour: the colour this agent plays as
        :param gameManager: the game manager controlling the match
        """
        self.gameManager = gameManager
        self.actionSpace = actionSpace
        self.board = board
        self.state = self.board.board
        self.colour = colour
        self.score = 0
        self.epsilon = 1  # exploration rate, decays towards epsilonMin
        self.gamma = .95  # discount factor for future rewards
        self.batchSize = 256  # minimum number of stored transitions before training starts
        self.maxSize = 32  # padded length of the move vector and replay sample size
        self.epsilonMin = .01
        self.epsilonDecay = .995
        self.learningRate = 0.001
        self.memory = deque(maxlen=10000000)  # replay memory of past transitions
        self.model = self._buildMainModel()
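
    # AI() below is the per-turn entry point: it encodes the legal moves, picks one
    # (exploring or exploiting), applies it via board.step, stores the resulting
    # transition in memory and then replays the memory to train the value network.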

    def AI(self, board: Board) -> tuple:
        """
        Plays one turn of the draughts game and trains on the result
        :param board: the current game board
        :return: the running score and the next board state (None if no moves are available)
        """
        self.board = board
        self.state = self._convertState(self.board.board)
        self.actionSpace = self._encodeMoves(self.colour, self.board)
        if len(self.actionSpace) == 0:
            return self.score, None
        action = self._act()
        reward, nextState, done = self.board.step(action, self.colour)
        self.score += reward
        self.state = self._convertState(nextState.board)
        self._remember(deepcopy(self.board), action, reward, self.state, done)
        self._replay()
        return self.score, nextState
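
    # The value network below maps a vector of up to maxSize (32) encoded moves,
    # padded with 1s, to a single scalar estimate; the reshape Lambda flattens
    # whatever it receives into rows of 32 features before the dense stack.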

    def _buildMainModel(self) -> Sequential:
        """
        Build the value network for the AI
        :return: the compiled model
        """
        # Board model
        modelLayers = [
            Lambda(lambda x: tf.reshape(x, [-1, 32])),
            Dense(256, activation='relu'),
            Dropout(0.2),
            Dense(128, activation='relu'),
            Dropout(0.2),
            Dense(64, activation='relu'),
            Dropout(0.2),
            Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
            Dropout(0.2),
            Dense(16, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
            Dropout(0.2),
            Dense(1, activation='linear', kernel_regularizer=regularizers.l2(0.01))
        ]
        boardModel = Sequential(modelLayers)
        # boardModel.add(BatchNormalization())
        boardModel.compile(optimizer=Adam(learning_rate=0.0001), loss='mean_squared_error')
        boardModel.build(input_shape=(None, None))
        boardModel.summary()
        return boardModel
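
    # _replay performs a Q-learning-style update: for each sampled transition the
    # regression target is the plain reward when the game ended, otherwise
    #   target = reward + gamma * max_a' Q(s', a')
    # where the max over the opponent's follow-up moves comes from _maxNextQ().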

    def _replay(self) -> None:
        """
        Trains the model on a random sample of remembered transitions
        :return: None (void)
        """
        if len(self.memory) < self.batchSize:
            # Not enough data to replay and train the model
            return
        # Get a random sample from the memory
        minibatch = random.sample(self.memory, int(self.maxSize))
        # Extract states, rewards and dones
        states = [m[0] for m in minibatch]
        rewards = [m[2] for m in minibatch]
        dones = [m[4] for m in minibatch]
        # Encode the available moves for each sampled state
        encodedMoves = []
        for state in states:
            encodedMoves.append(self._encodeMoves(self.colour, state))
        # Calculate the targets
        targets = []
        for i, moves in enumerate(encodedMoves):
            if dones[i]:
                target = rewards[i]
            else:
                target = rewards[i] + self.gamma * self._maxNextQ()
            targets.append(target)
        # Pad every move list to a fixed length of maxSize before feeding the network
        encodedMoves = np.array([np.pad(m, (0, self.maxSize - len(m)), 'constant', constant_values=(1, 1))
                                 for m in encodedMoves])
        targets = np.array(targets)
        self.model.fit(self._normalise(encodedMoves), self._normalise(targets), epochs=20)
        # Decay the exploration rate
        if self.epsilon > self.epsilonMin:
            self.epsilon *= self.epsilonDecay
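
    # Each remembered transition is the tuple (state, action, reward, nextState, done),
    # where state is a deep copy of the Board before the move and nextState is the
    # converted numeric board after it (see AI()).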

    def _remember(self, state: Board, action: int, reward: float, nextState: list, done: bool) -> None:
        """
        Stores a transition in the replay memory
        :param state: the board before the move
        :param action: the encoded action taken
        :param reward: the reward for the action
        :param nextState: the converted board after the move
        :param done: whether the game is finished
        :return: None (void)
        """
        self.memory.append((state, action, reward, nextState, done))
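
    # _act is epsilon-greedy: with probability epsilon the move comes from the minimax
    # player (or a random legal move if minimax returns nothing), otherwise the model
    # scores the padded move vector and the highest-scoring move is played.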

    def _act(self) -> Any:
        """
        Chooses an action from the available moves
        :return: the encoded action
        """
        if np.random.rand() <= self.epsilon:
            # Explore: let minimax suggest a move, falling back to a random choice
            mm = MiniMax()
            value, newBoard = mm.AI(3, self.colour, self.gameManager)
            if newBoard is None:
                return random.choice(self.actionSpace)
            where = self._boardDiff(self.board, newBoard)
            return self._encode(where[0] + 1, where[1] + 1)
        if len(self.actionSpace) == 1:
            return self.actionSpace[0]
        # Exploit: score the padded move vector with the model and pick the best move
        encodedMoves = np.squeeze(self.actionSpace)
        encodedMoves = np.pad(encodedMoves, (0, self.maxSize - len(encodedMoves)), 'constant', constant_values=(1, 1))
        act_values = self.model.predict(self._normalise(encodedMoves))
        return self.actionSpace[np.argmax(act_values[0])]

    def resetScore(self) -> None:
        """Resets the running score to zero"""
        self.score = 0

    def _convertState(self, board: list) -> list:
        """
        Converts the board into a 2D list of numbers
        :param board: 2D list of pieces
        :return: new 2D list of numbers
        """
        num_board = []
        for row in board:
            num_row = []
            for piece in row:
                if piece == 0:
                    num_row.append(0)
                    continue
                if piece.colour == 1:
                    num_row.append(1)
                    continue
                num_row.append(2)
            num_board.append(num_row)
        return num_board
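
    # _encode packs a move into one integer by concatenating four digits, e.g. changed
    # rows (3, 5) and columns (4, 6) become int("3" + "4" + "5" + "6") = 3456.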

    def _encode(self, start: tuple, end: tuple) -> int:
        """
        Encodes a move into a single integer
        :param start: the row indices of the squares that changed (as produced by _boardDiff)
        :param end: the column indices of the squares that changed
        :return: encoded move
        """
        start_row = start[0]
        start_col = end[0]
        end_row = start[-1]
        end_col = end[-1]
        # Concatenate the four digits into one integer
        return int(str(start_row) + str(start_col) + str(end_row) + str(end_col))
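
    # _maxNextQ supplies the bootstrap term of the target by scoring the opponent's
    # available moves on the current board; -1 is used when the opponent has no
    # legal reply.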

    def _maxNextQ(self) -> float:
        """
        Estimates the best Q value available to the opponent from the current board
        :return: the maximum predicted Q value, or -1 if the opponent has no moves
        """
        colour = WHITE if self.colour == GREEN else GREEN
        encodedMoves = self._encodeMoves(colour, self.board)
        if len(encodedMoves) == 0:
            return -1
        paddedMoves = np.array(np.pad(encodedMoves, (0, self.maxSize - len(encodedMoves)), 'constant', constant_values=(1, 1)))
        nextQValues = self.model.predict_on_batch(self._normalise(paddedMoves))
        return np.max(nextQValues)

    def _encodeMoves(self, colour: int, board: Board) -> list:
        """
        Encodes all available moves into a list of encoded moves
        :param colour: colour of the player
        :param board: the board
        :return: list of encoded moves
        """
        encodedMoves = []
        moves = board.getAllMoves(colour)
        for move in moves:
            where = self._boardDiff(board, move)
            encodedMoves.append(self._encode(where[0] + 1, where[1] + 1))
        return encodedMoves

    def _boardDiff(self, board: Board, move: Board) -> tuple:
        """
        Returns the row and column indices of the squares that differ between board and move
        """
        cnvState = np.array(self._convertState(board.board))
        cnvMove = np.array(self._convertState(move.board))
        diff = np.subtract(cnvMove, cnvState)
        diff = np.nonzero(diff)
        return diff

    def _normalise(self, data: np.ndarray) -> np.ndarray:
        """
        Normalises each row of the data to unit length
        """
        # Work in floats so the normalised values are not truncated back to integers
        data = np.asarray(data, dtype=float)
        for i in range(len(data)):
            data[i] = data[i] / np.linalg.norm(data[i])
        return data
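

# A minimal, hypothetical usage sketch. How Board and GameManager are actually
# constructed lives elsewhere in the repository, so the lines below are assumptions
# rather than the project's real driver code:
#
#   gameManager = GameManager()
#   board = gameManager.board                      # assumed attribute
#   agent = ReinforcementLearning(board.getAllMoves(GREEN), board, GREEN, gameManager)
#   score, nextBoard = agent.AI(board)             # one turn: act, remember, replay
#   agent.resetScore()                             # start a fresh score between games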