reinforcement-learning #1
@@ -4,7 +4,7 @@
   <content url="file://$MODULE_DIR$">
     <excludeFolder url="file://$MODULE_DIR$/venv" />
   </content>
-  <orderEntry type="jdk" jdkName="Python 3.11 (draughts)" jdkType="Python SDK" />
+  <orderEntry type="jdk" jdkName="$USER_HOME$/anaconda3" jdkType="Python SDK" />
   <orderEntry type="sourceFolder" forTests="false" />
 </component>
</module>
Report.pdf  (binary file not shown)
main.py  (45 lines changed)

@@ -1,6 +1,7 @@
 import sys
 
 import pygame
+import numpy as np
 from matplotlib import pyplot as plt
 
 from reinforcementLearning.ReinforcementLearning import ReinforcementLearning
@@ -178,23 +179,38 @@ def game(difficulty):
     clock = pygame.time.Clock()
     gameManager = GameManager(WIN, GREEN)
     rl = ReinforcementLearning(gameManager.board.getAllMoves(WHITE), gameManager.board, WHITE, gameManager)
+    model = rl.buildMainModel()
+    model.load_weights("./modelWeights/model_final.h5")
     mm = MiniMax()
     totalReward = []
-    for i in range(2000):
+    winners = []
+    for i in range(100):
         score = 0
         for j in range(200):
+            print(j)
             clock.tick(FPS)
             reward = 0
             if gameManager.turn == WHITE:
-                mm = MiniMax()
-                value, newBoard = mm.AI(difficulty, WHITE, gameManager)
+                # mm = MiniMax()
+                # value, newBoard = mm.AI(difficulty, WHITE, gameManager)
                 # gameManager.aiMove(newBoard)
                 # reward, newBoard = rl.AI(gameManager.board)
-                if newBoard is None:
+                actionSpace = rl.encodeMoves(WHITE, gameManager.board)
+                if len(actionSpace) == 0:
                     print("Cannot make move")
                     continue
+                totalMoves = len(actionSpace)
+                # moves = np.squeeze(moves)
+                moves = np.pad(actionSpace, (0, rl.maxSize - totalMoves), 'constant', constant_values=(1, 1))
+                act_values = model.predict(rl.normalise(moves))
+                val = np.argmax(act_values[0])
+                val = val if val < totalMoves else totalMoves - 1
+                reward, newBoard, done = gameManager.board.step(actionSpace[val], WHITE)
+
+                # if newBoard is None:
+                # print("Cannot make move")
+                # continue
                 gameManager.aiMove(newBoard)
-                #
 
             gameManager.update()
             pygame.display.update()
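Read in isolation, the new inference path above pads the encoded action space out to the network's fixed input width, scores it in one call, and clamps the argmax so it can never index into the padding. A minimal sketch of just that selection step; `predict_fn` stands in for `model.predict` combined with `rl.normalise` from the diff and is not a function in the repository:

```python
import numpy as np

def pick_move(predict_fn, actionSpace, maxSize=32):
    """Pad the encoded moves, score them once, and clamp the argmax to a real move."""
    totalMoves = len(actionSpace)
    # Pad with 1s up to the fixed width, mirroring np.pad(..., constant_values=(1, 1)) above
    moves = np.pad(actionSpace, (0, maxSize - totalMoves), 'constant', constant_values=(1, 1))
    act_values = predict_fn(moves)
    val = int(np.argmax(act_values))
    # argmax may land on a padded slot, so fall back to the last real move
    val = val if val < totalMoves else totalMoves - 1
    return actionSpace[val]
```

For example, `pick_move(lambda m: m / 10000, [1203, 2314])` returns 2314, because the padded slots score lower than every real move.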
@@ -206,7 +222,10 @@ def game(difficulty):
             score += reward

             if gameManager.winner() is not None:
-                print(gameManager.winner())
+                print("Green" if gameManager.winner() == GREEN else "White", " wins")
+                with open("winners.txt", "a+") as f:
+                    f.write(str(gameManager.winner()) + "\n")
+                winners.append(gameManager.winner())
                 break

             # for event in pygame.event.get():
@@ -221,9 +240,16 @@ def game(difficulty):
             gameManager.update()
             pygame.display.update()

+        if gameManager.winner() is None:
+            with open("winners.txt", "a+") as f:
+                f.write(str(0) + "\n")
+            winners.append(0)
         gameManager.reset()
         rl.resetScore()
         print("Game: ", i, " Reward: ", score)
+        with open("rewards.txt", "a+") as f:
+            f.write(str(score) + "\n")
+
         totalReward.append(score)
         # save model weights every 25 games
         if i % 250 == 0 and i != 0:
@@ -237,5 +263,12 @@ def game(difficulty):
     plt.ylabel("Reward")
     plt.show()

+    fig, ax = plt.subplots()
+    bar = ax.bar(["Draw", "White", "Green"], [winners.count(0), winners.count(WHITE), winners.count(GREEN)])
+    ax.set(xlabel='Winner', ylabel='Frequency', ylim=[0, 500])
+    ax.set_title("Winners")
+    ax.bar_label(bar)
+    plt.show()
+

main(3)
@@ -37,9 +37,10 @@ class ReinforcementLearning():
         self.maxSize = 32
         self.epsilonMin = .01
         self.epsilonDecay = .995
-        self.learningRate = 0.001
+        self.learningRate = 0.0001
         self.memory = deque(maxlen=10000000)
-        self.model = self._buildMainModel()
+        self.model = self.buildMainModel()
+        print(self.model.summary())

     def AI(self, board: Board) -> tuple:
         """
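Two hyperparameters move in this hunk: the learning rate drops from 1e-3 to 1e-4, and the model summary is now printed at construction time. With the epsilonDecay and epsilonMin shown in context, exploration decays multiplicatively once per replay; assuming epsilon starts at 1.0 (its initial value sits outside this hunk), it takes roughly 919 replay calls to reach the 1% floor:

```python
import math

# epsilonMin and epsilonDecay are taken from the context lines above;
# the starting epsilon of 1.0 is an assumption.
epsilon, epsilonMin, epsilonDecay = 1.0, 0.01, 0.995
steps = math.ceil(math.log(epsilonMin / epsilon) / math.log(epsilonDecay))
print(steps)  # -> 919 decay steps before epsilon stops shrinking
```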
@@ -48,7 +49,7 @@ class ReinforcementLearning():
         """
         self.board = board
         self.state = self._convertState(self.board.board)
-        self.actionSpace = self._encodeMoves(self.colour, self.board)
+        self.actionSpace = self.encodeMoves(self.colour, self.board)
         if len(self.actionSpace) == 0:
             return self.score, None

@@ -61,7 +62,7 @@ class ReinforcementLearning():

         return self.score, nextState

-    def _buildMainModel(self) -> Sequential:
+    def buildMainModel(self) -> Sequential:
         """
         Build the model for the AI
         :return: the model
@@ -69,26 +70,24 @@ class ReinforcementLearning():
         # Board model
         modelLayers = [
             Lambda(lambda x: tf.reshape(x, [-1, 32])),
-            Dense(256, activation='relu'),
+            Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
             Dropout(0.2),
-            Dense(128, activation='relu'),
+            Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
             Dropout(0.2),
-            Dense(64, activation='relu'),
+            Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
+            Dropout(0.2),
+            Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
             Dropout(0.2),
             Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
             Dropout(0.2),
-            Dense(16, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
-            Dropout(0.2),
-            Dense(1, activation='linear', kernel_regularizer=regularizers.l2(0.01))
+            Dense(16, activation='linear', kernel_regularizer=regularizers.l2(0.01))
         ]
         boardModel = Sequential(modelLayers)

         # boardModel.add(BatchNormalization())
-        boardModel.compile(optimizer=Adam(learning_rate=0.0001), loss='mean_squared_error')
+        boardModel.compile(optimizer=Adam(learning_rate=self.learningRate), loss='mean_squared_error')
         boardModel.build(input_shape=(None, None))

-        print(boardModel.summary())
-
         return boardModel

     def _replay(self) -> None:
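Pieced together from the added lines, the value network after this hunk is a plain MLP over the 32-slot move encoding, now ending in a 16-unit linear head instead of the earlier single linear unit, with L2 regularisation on every Dense layer. A sketch of the resulting stack as a standalone builder; the keras import paths are assumed to match the ones already used in this file:

```python
import tensorflow as tf
from tensorflow.keras import Sequential, regularizers
from tensorflow.keras.layers import Dense, Dropout, Lambda
from tensorflow.keras.optimizers import Adam

def build_main_model(learning_rate=0.0001):
    """Rebuild of the layer stack added above: a 512-256-128-64-32-16 MLP with L2 and dropout."""
    model = Sequential([
        Lambda(lambda x: tf.reshape(x, [-1, 32])),  # flatten the input to a 32-wide batch
        Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
        Dropout(0.2),
        Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
        Dropout(0.2),
        Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
        Dropout(0.2),
        Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
        Dropout(0.2),
        Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
        Dropout(0.2),
        Dense(16, activation='linear', kernel_regularizer=regularizers.l2(0.01)),
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error')
    model.build(input_shape=(None, None))
    return model
```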
@@ -111,7 +110,7 @@ class ReinforcementLearning():
         # Encoded moves
         encodedMoves = []
         for state in states:
-            encodedMoves.append(self._encodeMoves(self.colour, state))
+            encodedMoves.append(self.encodeMoves(self.colour, state))

         # Calculate targets
         targets = []
@@ -126,7 +125,7 @@ class ReinforcementLearning():
         encodedMoves = np.array([np.pad(m, (0, self.maxSize - len(m)), 'constant', constant_values=(1, 1))
                                  for m in encodedMoves])
         targets = np.array(targets)
-        self.model.fit(self._normalise(encodedMoves), self._normalise(targets), epochs=20)
+        self.model.fit(self.normalise(encodedMoves), self.normalise(targets), epochs=20)
         if self.epsilon > self.epsilonMin:
             self.epsilon *= self.epsilonDecay

@@ -160,8 +159,10 @@ class ReinforcementLearning():
             return self.actionSpace[0]
         encodedMoves = np.squeeze(self.actionSpace)
         encodedMoves = np.pad(encodedMoves, (0, self.maxSize - len(encodedMoves)), 'constant', constant_values=(1, 1))
-        act_values = self.model.predict(self._normalise(encodedMoves))
-        return self.actionSpace[np.argmax(act_values[0])]
+        act_values = self.model.predict(self.normalise(encodedMoves))
+        val = np.argmax(act_values[0])
+        val = val if val < len(self.actionSpace) else len(self.actionSpace) - 1
+        return self.actionSpace[val]

     def resetScore(self):
         self.score = 0
@@ -209,20 +210,14 @@ class ReinforcementLearning():

     def _maxNextQ(self) -> float:
         colour = WHITE if self.colour == GREEN else GREEN
-        encodedMoves = self._encodeMoves(colour, self.board)
+        encodedMoves = self.encodeMoves(colour, self.board)
         if len(encodedMoves) == 0:
             return -1
         paddedMoves = np.array(np.pad(encodedMoves, (0, self.maxSize - len(encodedMoves)), 'constant', constant_values=(1, 1)))
-        # paddedMoves = np.reshape(paddedMoves, (32, 8, 8))
-        # paddedMoves = paddedMoves / np.max(paddedMoved
-        # paddedMoves = paddedMoves.reshape(32,)
-        # pm = tf.convert_to_tensor(paddedMoves, dtype=tf.float32)
-        # pm = tf.reshape(pm, [32])
-        print(paddedMoves.shape)
-        nextQValues = self.model.predict_on_batch(self._normalise(paddedMoves))
+        nextQValues = self.model.predict_on_batch(self.normalise(paddedMoves))
         return np.max(nextQValues)

-    def _encodeMoves(self, colour: int, board: Board) -> list:
+    def encodeMoves(self, colour: int, board: Board) -> list:
         """
         Encodes the moves into a list encoded moves
         :param colour: colour of the player
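_maxNextQ now pads the opponent's encoded moves, runs them through the same network, and returns the largest predicted value (or -1 when the opponent cannot move). In the replay step it feeds the bootstrap term of a standard one-step TD target; a sketch of that combination, with the discount factor as an assumption since it sits outside this hunk:

```python
def q_target(reward: float, max_next_q: float, gamma: float = 0.95) -> float:
    # One-step TD target; gamma = 0.95 is an illustrative discount, not a value from the diff.
    return reward + gamma * max_next_q
```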
@@ -243,10 +238,8 @@ class ReinforcementLearning():
         diff = np.nonzero(diff)
         return diff

-    def _normalise(self, data):
+    def normalise(self, data):
         """
         Normalise the data
         """
-        for i in range(len(data)):
-            data[i] = data[i] / np.linalg.norm(data[i])
-        return data
+        return data / 10000
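The normalisation change is worth a second look: the old _normalise rescaled each row of the input by its own L2 norm, so the same encoded move could map to different network inputs depending on the other moves alongside it, while the new normalise divides everything by a fixed 10000. A small comparison on made-up encoded moves:

```python
import numpy as np

moves = np.array([1203.0, 2314.0, 4556.0])  # hypothetical encoded moves

old_style = moves / np.linalg.norm(moves)  # scale depends on the rest of the row
new_style = moves / 10000                  # fixed scale: the same move always maps to the same input
```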
results.py  (new file, 27 lines)

@@ -0,0 +1,27 @@
+import matplotlib.pyplot as plt
+
+from utilities.constants import GREEN, WHITE
+
+# winners = []
+with open("winners.txt") as f:
+    winners = f.readlines()
+
+winners = [int(x.strip()) for x in winners]
+
+
+fig, ax = plt.subplots()
+bar = ax.bar(["Draw", "White", "Green"], [winners.count(0), winners.count(WHITE), winners.count(GREEN)])
+ax.set(xlabel='Winner', ylabel='Frequency', ylim=[0, 500])
+ax.set_title("Winners")
+ax.bar_label(bar)
+plt.show()
+
+
+with open("rewardsA.txt") as f:
+    totalReward = f.readlines()
+
+
+plt.plot([i for i in range(len(totalReward))], totalReward)
+plt.xlabel("Games")
+plt.ylabel("Reward")
+plt.show()
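One caveat when reusing this script: readlines() returns strings, and only the winners list is converted to int, so the reward curve is plotted from raw strings, which matplotlib treats as categorical values. A hedged fix mirroring the winners conversion:

```python
with open("rewardsA.txt") as f:
    totalReward = [float(x.strip()) for x in f]  # convert before plt.plot, as done for winners above
```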
@@ -63,19 +63,44 @@ class Board:
             if piece != 0:
                 if piece.colour == GREEN:
                     self.greenLeft -= 1
-                    return
+                    continue
                 self.whiteLeft -= 1

     def getAllMoves(self, colour):
         moves = []
+        possibleMoves = []
+        possiblePieces = []
+        pieces = self.getAllPieces(colour)
+        hasForcedCapture = False
+
-        for piece in self.getAllPieces(colour):
+        for piece in pieces:
+            validMoves = self.getValidMoves(piece)
+
+            # Check if there are forced capture moves for this piece
+            forcedCaptureMoves = [move for move, skip in validMoves.items() if skip]
+            if forcedCaptureMoves:
+                hasForcedCapture = True
+                possiblePieces.append(piece)
+                possibleMoves.append({move: skip for move, skip in validMoves.items() if skip})
+
+        if hasForcedCapture:
+            # If there are forced capture moves, consider only those
+            for i in range(len(possibleMoves)):
+                for move, skip in possibleMoves[i].items():
+                    tempBoard = deepcopy(self)
+                    tempPiece = tempBoard.getPiece(possiblePieces[i].row, possiblePieces[i].col)
+                    newBoard = self._simulateMove(tempPiece, move, tempBoard, skip)
+                    moves.append(newBoard)
+        else:
+            # If no forced capture moves, consider all valid moves
+            for piece in pieces:
                 validMoves = self.getValidMoves(piece)
                 for move, skip in validMoves.items():
                     tempBoard = deepcopy(self)
                     tempPiece = tempBoard.getPiece(piece.row, piece.col)
                     newBoard = self._simulateMove(tempPiece, move, tempBoard, skip)
                     moves.append(newBoard)

         return moves

     def _simulateMove(self, piece, move, board, skip):
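The rewritten getAllMoves enforces the draughts rule that a capture must be taken when one is available: it first scans every piece for capturing moves and only falls back to the full move list when none exist. The core of that filter, condensed into a standalone sketch; getValidMoves is assumed to return a {move: skipped_pieces} dict, as the diff suggests:

```python
def forced_capture_filter(pieces, get_valid_moves):
    """Return (piece, move, skip) triples, restricted to captures whenever any capture exists."""
    all_moves = [(p, m, s) for p in pieces for m, s in get_valid_moves(p).items()]
    captures = [(p, m, s) for p, m, s in all_moves if s]  # a truthy skip marks a capture
    return captures if captures else all_moves
```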
@@ -134,6 +159,7 @@ class Board:
                 forcedCapture = forced
             else:
                 forcedCapture = forced

         return forcedCapture
+
     def scoreOfTheBoard(self):
@@ -241,7 +267,7 @@ class Board:
     def _decode(self, move):
         # Split digits back out
         str_code = str(move)
-        print(str_code)
+        # print(str_code)
         start_row = int(str_code[0])
         start_col = int(str_code[1])
         end_row = int(str_code[2])