Bug fixes to the RL algorithm and some tests

Rohit Pai 2023-09-06 15:06:20 +01:00
parent 1aa8ffa8fc
commit 6d4e364f8d
6 changed files with 124 additions and 45 deletions


@@ -4,7 +4,7 @@
   <content url="file://$MODULE_DIR$">
     <excludeFolder url="file://$MODULE_DIR$/venv" />
   </content>
-  <orderEntry type="jdk" jdkName="Python 3.11 (draughts)" jdkType="Python SDK" />
+  <orderEntry type="jdk" jdkName="$USER_HOME$/anaconda3" jdkType="Python SDK" />
   <orderEntry type="sourceFolder" forTests="false" />
 </component>
</module>

Binary file not shown.

main.py

@@ -1,6 +1,7 @@
 import sys
 import pygame
+import numpy as np
 from matplotlib import pyplot as plt
 from reinforcementLearning.ReinforcementLearning import ReinforcementLearning
@@ -178,23 +179,38 @@ def game(difficulty):
     clock = pygame.time.Clock()
     gameManager = GameManager(WIN, GREEN)
     rl = ReinforcementLearning(gameManager.board.getAllMoves(WHITE), gameManager.board, WHITE, gameManager)
+    model = rl.buildMainModel()
+    model.load_weights("./modelWeights/model_final.h5")
     mm = MiniMax()
     totalReward = []
-    for i in range(2000):
+    winners = []
+    for i in range(100):
         score = 0
         for j in range(200):
+            print(j)
             clock.tick(FPS)
             reward = 0
             if gameManager.turn == WHITE:
-                mm = MiniMax()
-                value, newBoard = mm.AI(difficulty, WHITE, gameManager)
+                # mm = MiniMax()
+                # value, newBoard = mm.AI(difficulty, WHITE, gameManager)
                 # gameManager.aiMove(newBoard)
                 # reward, newBoard = rl.AI(gameManager.board)
-                if newBoard is None:
+                actionSpace = rl.encodeMoves(WHITE, gameManager.board)
+                if len(actionSpace) == 0:
                     print("Cannot make move")
                     continue
+                totalMoves = len(actionSpace)
+                # moves = np.squeeze(moves)
+                moves = np.pad(actionSpace, (0, rl.maxSize - totalMoves), 'constant', constant_values=(1, 1))
+                act_values = model.predict(rl.normalise(moves))
+                val = np.argmax(act_values[0])
+                val = val if val < totalMoves else totalMoves - 1
+                reward, newBoard, done = gameManager.board.step(actionSpace[val], WHITE)
+                # if newBoard is None:
+                #     print("Cannot make move")
+                #     continue
                 gameManager.aiMove(newBoard)
+            #
             gameManager.update()
             pygame.display.update()
@@ -206,7 +222,10 @@ def game(difficulty):
             score += reward
             if gameManager.winner() is not None:
-                print(gameManager.winner())
+                print("Green" if gameManager.winner() == GREEN else "White", " wins")
+                with open("winners.txt", "a+") as f:
+                    f.write(str(gameManager.winner()) + "\n")
+                winners.append(gameManager.winner())
                 break

             # for event in pygame.event.get():
@@ -221,9 +240,16 @@ def game(difficulty):
             gameManager.update()
             pygame.display.update()
+        if gameManager.winner() is None:
+            with open("winners.txt", "a+") as f:
+                f.write(str(0) + "\n")
+            winners.append(0)
         gameManager.reset()
         rl.resetScore()
         print("Game: ", i, " Reward: ", score)
+        with open("rewards.txt", "a+") as f:
+            f.write(str(score) + "\n")
         totalReward.append(score)
         # save model weights every 25 games
         if i % 250 == 0 and i != 0:
@@ -237,5 +263,12 @@ def game(difficulty):
     plt.ylabel("Reward")
     plt.show()
+    fig, ax = plt.subplots()
+    bar = ax.bar(["Draw", "White", "Green"], [winners.count(0), winners.count(WHITE), winners.count(GREEN)])
+    ax.set(xlabel='Winner', ylabel='Frequency', ylim=[0, 500])
+    ax.set_title("Winners")
+    ax.bar_label(bar)
+    plt.show()

 main(3)
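The new evaluation path in game() replaces the minimax move with the trained network: it encodes the legal moves, pads them to rl.maxSize, scores them with the loaded weights, and clamps the argmax so a padding slot can never be picked. A minimal standalone sketch of that selection step, with a hypothetical select_move helper that assumes the same 32-wide padding and /10000 scaling used in this commit:

import numpy as np

MAX_SIZE = 32  # mirrors rl.maxSize in the diff

def select_move(model, action_space):
    """Return the move whose padded encoding scores highest, or None if no move exists."""
    if len(action_space) == 0:
        return None
    total = len(action_space)
    # Pad the encoded moves with 1s so the input always has MAX_SIZE entries.
    padded = np.pad(action_space, (0, MAX_SIZE - total), 'constant', constant_values=(1, 1))
    scores = model.predict(padded / 10000)  # same scaling as rl.normalise in this commit
    idx = int(np.argmax(scores[0]))
    # Clamp so an index that lands in the padding still maps to a real move.
    return action_space[min(idx, total - 1)]

The clamp mirrors the val = val if val < totalMoves else totalMoves - 1 guard added both here and in the agent's own act path below.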


@@ -37,9 +37,10 @@ class ReinforcementLearning():
         self.maxSize = 32
         self.epsilonMin = .01
         self.epsilonDecay = .995
-        self.learningRate = 0.001
+        self.learningRate = 0.0001
         self.memory = deque(maxlen=10000000)
-        self.model = self._buildMainModel()
+        self.model = self.buildMainModel()
+        print(self.model.summary())

     def AI(self, board: Board) -> tuple:
         """
@@ -48,7 +49,7 @@ class ReinforcementLearning():
         """
         self.board = board
         self.state = self._convertState(self.board.board)
-        self.actionSpace = self._encodeMoves(self.colour, self.board)
+        self.actionSpace = self.encodeMoves(self.colour, self.board)
         if len(self.actionSpace) == 0:
             return self.score, None
@@ -61,7 +62,7 @@ class ReinforcementLearning():
         return self.score, nextState

-    def _buildMainModel(self) -> Sequential:
+    def buildMainModel(self) -> Sequential:
         """
         Build the model for the AI
         :return: the model
@@ -69,26 +70,24 @@ class ReinforcementLearning():
         # Board model
         modelLayers = [
             Lambda(lambda x: tf.reshape(x, [-1, 32])),
-            Dense(256, activation='relu'),
+            Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
             Dropout(0.2),
-            Dense(128, activation='relu'),
+            Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
             Dropout(0.2),
-            Dense(64, activation='relu'),
+            Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
+            Dropout(0.2),
+            Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
             Dropout(0.2),
             Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
             Dropout(0.2),
-            Dense(16, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
-            Dropout(0.2),
-            Dense(1, activation='linear', kernel_regularizer=regularizers.l2(0.01))
+            Dense(16, activation='linear', kernel_regularizer=regularizers.l2(0.01))
         ]
         boardModel = Sequential(modelLayers)
         # boardModel.add(BatchNormalization())
-        boardModel.compile(optimizer=Adam(learning_rate=0.0001), loss='mean_squared_error')
+        boardModel.compile(optimizer=Adam(learning_rate=self.learningRate), loss='mean_squared_error')
         boardModel.build(input_shape=(None, None))
-        print(boardModel.summary())
         return boardModel

     def _replay(self) -> None:
@@ -111,7 +110,7 @@ class ReinforcementLearning():
         # Encoded moves
         encodedMoves = []
         for state in states:
-            encodedMoves.append(self._encodeMoves(self.colour, state))
+            encodedMoves.append(self.encodeMoves(self.colour, state))

         # Calculate targets
         targets = []
@@ -126,7 +125,7 @@ class ReinforcementLearning():
         encodedMoves = np.array([np.pad(m, (0, self.maxSize - len(m)), 'constant', constant_values=(1, 1))
                                  for m in encodedMoves])
         targets = np.array(targets)
-        self.model.fit(self._normalise(encodedMoves), self._normalise(targets), epochs=20)
+        self.model.fit(self.normalise(encodedMoves), self.normalise(targets), epochs=20)

         if self.epsilon > self.epsilonMin:
             self.epsilon *= self.epsilonDecay
@@ -160,8 +159,10 @@ class ReinforcementLearning():
             return self.actionSpace[0]
         encodedMoves = np.squeeze(self.actionSpace)
         encodedMoves = np.pad(encodedMoves, (0, self.maxSize - len(encodedMoves)), 'constant', constant_values=(1, 1))
-        act_values = self.model.predict(self._normalise(encodedMoves))
-        return self.actionSpace[np.argmax(act_values[0])]
+        act_values = self.model.predict(self.normalise(encodedMoves))
+        val = np.argmax(act_values[0])
+        val = val if val < len(self.actionSpace) else len(self.actionSpace) - 1
+        return self.actionSpace[val]

     def resetScore(self):
         self.score = 0
@@ -209,20 +210,14 @@ class ReinforcementLearning():
     def _maxNextQ(self) -> float:
         colour = WHITE if self.colour == GREEN else GREEN
-        encodedMoves = self._encodeMoves(colour, self.board)
+        encodedMoves = self.encodeMoves(colour, self.board)
         if len(encodedMoves) == 0:
             return -1
         paddedMoves = np.array(np.pad(encodedMoves, (0, self.maxSize - len(encodedMoves)), 'constant', constant_values=(1, 1)))
-        # paddedMoves = np.reshape(paddedMoves, (32, 8, 8))
-        # paddedMoves = paddedMoves / np.max(paddedMoved
-        # paddedMoves = paddedMoves.reshape(32,)
-        # pm = tf.convert_to_tensor(paddedMoves, dtype=tf.float32)
-        # pm = tf.reshape(pm, [32])
-        print(paddedMoves.shape)
-        nextQValues = self.model.predict_on_batch(self._normalise(paddedMoves))
+        nextQValues = self.model.predict_on_batch(self.normalise(paddedMoves))
         return np.max(nextQValues)

-    def _encodeMoves(self, colour: int, board: Board) -> list:
+    def encodeMoves(self, colour: int, board: Board) -> list:
         """
         Encodes the moves into a list encoded moves
         :param colour: colour of the player
@@ -243,10 +238,8 @@ class ReinforcementLearning():
         diff = np.nonzero(diff)
         return diff

-    def _normalise(self, data):
+    def normalise(self, data):
         """
         Normalise the data
         """
-        for i in range(len(data)):
-            data[i] = data[i] / np.linalg.norm(data[i])
-        return data
+        return data / 10000
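The normalise rewrite swaps the per-element L2 scaling for one fixed divisor. Because the encoded moves are small integers built from board coordinates (see _decode at the bottom of this diff), a constant divisor keeps the same move mapping to the same network input at both training and prediction time. A hedged illustration in plain numpy, with made-up move codes rather than real board data:

import numpy as np

moves = np.array([1234.0, 2356.0, 6745.0, 1.0])  # hypothetical encoded moves plus one padding value

# Old behaviour: each entry divided by its own norm; on a flat vector of
# scalars every positive code collapses to 1, losing the move identity.
old = np.array([m / np.linalg.norm(m) for m in moves])

# New behaviour: one fixed divisor, so identical moves always produce
# identical inputs regardless of what else is in the batch.
new = moves / 10000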

results.py (new file)

@@ -0,0 +1,27 @@
+import matplotlib.pyplot as plt
+
+from utilities.constants import GREEN, WHITE
+
+# winners = []
+with open("winners.txt") as f:
+    winners = f.readlines()
+    winners = [int(x.strip()) for x in winners]
+
+fig, ax = plt.subplots()
+bar = ax.bar(["Draw", "White", "Green"], [winners.count(0), winners.count(WHITE), winners.count(GREEN)])
+ax.set(xlabel='Winner', ylabel='Frequency', ylim=[0, 500])
+ax.set_title("Winners")
+ax.bar_label(bar)
+plt.show()
+
+with open("rewardsA.txt") as f:
+    totalReward = f.readlines()
+
+plt.plot([i for i in range(len(totalReward))], totalReward)
+plt.xlabel("Games")
+plt.ylabel("Reward")
+plt.show()
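The reward plot above feeds readlines() output straight to plt.plot, so the y values are still strings. A small hedged variant, assuming one numeric reward per line as main.py now appends to rewards.txt, that converts them to floats before plotting:

import matplotlib.pyplot as plt

with open("rewards.txt") as f:  # main.py appends one reward per line to this file
    total_reward = [float(line.strip()) for line in f if line.strip()]

plt.plot(range(len(total_reward)), total_reward)
plt.xlabel("Games")
plt.ylabel("Reward")
plt.show()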


@@ -63,19 +63,44 @@ class Board:
             if piece != 0:
                 if piece.colour == GREEN:
                     self.greenLeft -= 1
-                    return
+                    continue
                 self.whiteLeft -= 1

     def getAllMoves(self, colour):
         moves = []
+        possibleMoves = []
+        possiblePieces = []
+        pieces = self.getAllPieces(colour)
+        hasForcedCapture = False
-        for piece in self.getAllPieces(colour):
+        for piece in pieces:
             validMoves = self.getValidMoves(piece)
-            for move, skip in validMoves.items():
-                tempBoard = deepcopy(self)
-                tempPiece = tempBoard.getPiece(piece.row, piece.col)
-                newBoard = self._simulateMove(tempPiece, move, tempBoard, skip)
-                moves.append(newBoard)
+
+            # Check if there are forced capture moves for this piece
+            forcedCaptureMoves = [move for move, skip in validMoves.items() if skip]
+            if forcedCaptureMoves:
+                hasForcedCapture = True
+                possiblePieces.append(piece)
+                possibleMoves.append({move: skip for move, skip in validMoves.items() if skip})
+
+        if hasForcedCapture:
+            # If there are forced capture moves, consider only those
+            for i in range(len(possibleMoves)):
+                for move, skip in possibleMoves[i].items():
+                    tempBoard = deepcopy(self)
+                    tempPiece = tempBoard.getPiece(possiblePieces[i].row, possiblePieces[i].col)
+                    newBoard = self._simulateMove(tempPiece, move, tempBoard, skip)
+                    moves.append(newBoard)
+        else:
+            # If no forced capture moves, consider all valid moves
+            for piece in pieces:
+                validMoves = self.getValidMoves(piece)
+                for move, skip in validMoves.items():
+                    tempBoard = deepcopy(self)
+                    tempPiece = tempBoard.getPiece(piece.row, piece.col)
+                    newBoard = self._simulateMove(tempPiece, move, tempBoard, skip)
+                    moves.append(newBoard)
+
         return moves

     def _simulateMove(self, piece, move, board, skip):
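The reworked getAllMoves enforces the forced-capture rule in two passes: first scan every piece for capture moves, then expand only those captures, or every legal move if no capture exists anywhere on the board. A stripped-down sketch of the same idea over generic helpers (valid_moves and apply_move are hypothetical stand-ins, not the Board API):

from copy import deepcopy

def legal_boards(board, pieces, valid_moves, apply_move):
    """valid_moves(piece) -> {move: skipped_pieces}; a non-empty skip marks a capture."""
    captures = [(p, m, skip) for p in pieces
                for m, skip in valid_moves(p).items() if skip]
    # If any piece can capture, only capturing moves are legal.
    chosen = captures if captures else [(p, m, skip) for p in pieces
                                        for m, skip in valid_moves(p).items()]
    # Each chosen move is applied to a copy so the real board is untouched.
    return [apply_move(deepcopy(board), p, m, skip) for p, m, skip in chosen]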
@@ -134,6 +159,7 @@ class Board:
                 forcedCapture = forced
             else:
                 forcedCapture = forced
+
         return forcedCapture
def scoreOfTheBoard(self): def scoreOfTheBoard(self):
@@ -241,7 +267,7 @@ class Board:
     def _decode(self, move):
         # Split digits back out
         str_code = str(move)
-        print(str_code)
+        # print(str_code)
         start_row = int(str_code[0])
         start_col = int(str_code[1])
         end_row = int(str_code[2])