Bug fixes to the RL algorithm and some tests
@@ -37,9 +37,10 @@ class ReinforcementLearning():
         self.maxSize = 32
         self.epsilonMin = .01
         self.epsilonDecay = .995
-        self.learningRate = 0.001
+        self.learningRate = 0.0001
         self.memory = deque(maxlen=10000000)
-        self.model = self._buildMainModel()
+        self.model = self.buildMainModel()
+        print(self.model.summary())

     def AI(self, board: Board) -> tuple:
         """
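Note: with epsilonMin = .01 and epsilonDecay = .995 the exploration rate decays geometrically once per replay. A minimal sketch of the resulting schedule, assuming epsilon starts at 1.0 (the starting value is not shown in this hunk):

    # Sketch of the epsilon-greedy decay schedule implied by these constants.
    # The initial epsilon of 1.0 is an assumption, not taken from this commit.
    epsilon, epsilonMin, epsilonDecay = 1.0, 0.01, 0.995
    steps = 0
    while epsilon > epsilonMin:
        epsilon *= epsilonDecay  # applied once per _replay() call, as below
        steps += 1
    print(steps)  # ~919 replays before exploration bottoms out near 1%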
@@ -48,7 +49,7 @@ class ReinforcementLearning():
         """
         self.board = board
         self.state = self._convertState(self.board.board)
-        self.actionSpace = self._encodeMoves(self.colour, self.board)
+        self.actionSpace = self.encodeMoves(self.colour, self.board)
         if len(self.actionSpace) == 0:
             return self.score, None

@@ -61,7 +62,7 @@ class ReinforcementLearning():

         return self.score, nextState

-    def _buildMainModel(self) -> Sequential:
+    def buildMainModel(self) -> Sequential:
         """
         Build the model for the AI
         :return: the model
@@ -69,26 +70,24 @@ class ReinforcementLearning():
         # Board model
         modelLayers = [
             Lambda(lambda x: tf.reshape(x, [-1, 32])),
-            Dense(256, activation='relu'),
+            Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
             Dropout(0.2),
-            Dense(128, activation='relu'),
+            Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
             Dropout(0.2),
-            Dense(64, activation='relu'),
+            Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
             Dropout(0.2),
-            Dense(1, activation='linear', kernel_regularizer=regularizers.l2(0.01))
+            Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
+            Dropout(0.2),
+            Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
+            Dropout(0.2),
+            Dense(16, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
+            Dropout(0.2),
+            Dense(16, activation='linear', kernel_regularizer=regularizers.l2(0.01))
         ]
         boardModel = Sequential(modelLayers)

         # boardModel.add(BatchNormalization())
-        boardModel.compile(optimizer=Adam(learning_rate=0.0001), loss='mean_squared_error')
+        boardModel.compile(optimizer=Adam(learning_rate=self.learningRate), loss='mean_squared_error')
         boardModel.build(input_shape=(None, None))

         print(boardModel.summary())

         return boardModel

     def _replay(self) -> None:
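Note: the new tower is a plain MLP over the flattened 32-value board, with L2(0.01) and Dropout(0.2) on every hidden layer and a 16-unit linear head in place of the old scalar output. A self-contained sketch of the same architecture for reference; the layer sizes come from this diff, while the build_model name and the explicit (None, 32) build shape are illustrative choices (the diff builds with (None, None), which the Lambda reshape makes equivalent here):

    import tensorflow as tf
    from tensorflow.keras import Sequential, regularizers
    from tensorflow.keras.layers import Dense, Dropout, Lambda
    from tensorflow.keras.optimizers import Adam

    def build_model(learning_rate=0.0001):
        # Mirrors the stack added in this commit: 512-256-128-64-32-16 hidden
        # units, each Dense L2-regularised, Dropout(0.2) between layers, and a
        # 16-unit linear output head.
        layers = [Lambda(lambda x: tf.reshape(x, [-1, 32]))]
        for units in (512, 256, 128, 64, 32, 16):
            layers.append(Dense(units, activation='relu',
                                kernel_regularizer=regularizers.l2(0.01)))
            layers.append(Dropout(0.2))
        layers.append(Dense(16, activation='linear',
                            kernel_regularizer=regularizers.l2(0.01)))
        model = Sequential(layers)
        model.compile(optimizer=Adam(learning_rate=learning_rate),
                      loss='mean_squared_error')
        model.build(input_shape=(None, 32))
        return model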
@@ -111,7 +110,7 @@ class ReinforcementLearning():
         # Encoded moves
         encodedMoves = []
         for state in states:
-            encodedMoves.append(self._encodeMoves(self.colour, state))
+            encodedMoves.append(self.encodeMoves(self.colour, state))

         # Calculate targets
         targets = []
@@ -126,7 +125,7 @@ class ReinforcementLearning():
         encodedMoves = np.array([np.pad(m, (0, self.maxSize - len(m)), 'constant', constant_values=(1, 1))
                                  for m in encodedMoves])
         targets = np.array(targets)
-        self.model.fit(self._normalise(encodedMoves), self._normalise(targets), epochs=20)
+        self.model.fit(self.normalise(encodedMoves), self.normalise(targets), epochs=20)
         if self.epsilon > self.epsilonMin:
             self.epsilon *= self.epsilonDecay

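Note: every move list is right-padded with the constant 1 up to maxSize = 32 so that each training batch has a fixed width. A small illustration with made-up encoded move values:

    import numpy as np

    maxSize = 32
    moves = np.array([4052, 1731, 2264])  # hypothetical encoded moves
    padded = np.pad(moves, (0, maxSize - len(moves)),
                    'constant', constant_values=(1, 1))
    print(padded.shape)  # (32,) - three real values followed by 29 ones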
@@ -160,8 +159,10 @@ class ReinforcementLearning():
             return self.actionSpace[0]
         encodedMoves = np.squeeze(self.actionSpace)
         encodedMoves = np.pad(encodedMoves, (0, self.maxSize - len(encodedMoves)), 'constant', constant_values=(1, 1))
-        act_values = self.model.predict(self._normalise(encodedMoves))
-        return self.actionSpace[np.argmax(act_values[0])]
+        act_values = self.model.predict(self.normalise(encodedMoves))
+        val = np.argmax(act_values[0])
+        val = val if val < len(self.actionSpace) else len(self.actionSpace) - 1
+        return self.actionSpace[val]

     def resetScore(self):
         self.score = 0
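Note: the clamp on val is needed because the new head always emits 16 Q-values (one per padded slot) while actionSpace can be shorter, so the raw argmax can index past the legal moves. A toy illustration with stand-in values:

    import numpy as np

    actionSpace = [101, 102, 103]        # only 3 legal moves this turn
    act_values = np.random.rand(1, 16)   # but the model scores all 16 slots
    val = np.argmax(act_values[0])       # may land anywhere in 0..15
    val = val if val < len(actionSpace) else len(actionSpace) - 1
    print(actionSpace[val])              # always a legal move

Masking the padded slots before the argmax would avoid silently favouring the last legal move, but the clamp at least removes the out-of-range index.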
@@ -209,20 +210,14 @@ class ReinforcementLearning():

     def _maxNextQ(self) -> float:
         colour = WHITE if self.colour == GREEN else GREEN
-        encodedMoves = self._encodeMoves(colour, self.board)
+        encodedMoves = self.encodeMoves(colour, self.board)
         if len(encodedMoves) == 0:
             return -1
         paddedMoves = np.array(np.pad(encodedMoves, (0, self.maxSize - len(encodedMoves)), 'constant', constant_values=(1, 1)))
-        # paddedMoves = np.reshape(paddedMoves, (32, 8, 8))
-        # paddedMoves = paddedMoves / np.max(paddedMoves)
-        # paddedMoves = paddedMoves.reshape(32,)
-        # pm = tf.convert_to_tensor(paddedMoves, dtype=tf.float32)
-        # pm = tf.reshape(pm, [32])
-        print(paddedMoves.shape)
-        nextQValues = self.model.predict_on_batch(self._normalise(paddedMoves))
+        nextQValues = self.model.predict_on_batch(self.normalise(paddedMoves))
         return np.max(nextQValues)

-    def _encodeMoves(self, colour: int, board: Board) -> list:
+    def encodeMoves(self, colour: int, board: Board) -> list:
         """
         Encodes the moves into a list of encoded moves
         :param colour: colour of the player
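Note: _maxNextQ scores the opponent's replies and returns the best one, which is the bootstrap term of a one-step TD target. The gamma and reward bookkeeping live outside this diff, so the following is only a sketch of how such a value is typically combined; gamma, reward, and the stand-in Q-values are assumptions:

    import numpy as np

    gamma = 0.95                             # discount factor (assumed; not in this diff)
    reward = 1.0                             # reward for the transition (assumed)
    nextQValues = np.array([0.2, 0.7, 0.1])  # stand-in for predict_on_batch output
    maxNextQ = np.max(nextQValues)           # what _maxNextQ() returns
    target = reward + gamma * maxNextQ       # one-step TD target fed to fit()
    print(target)                            # 1.665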
@@ -243,10 +238,8 @@ class ReinforcementLearning():
         diff = np.nonzero(diff)
         return diff

-    def _normalise(self, data):
+    def normalise(self, data):
         """
         Normalise the data
         """
-        for i in range(len(data)):
-            data[i] = data[i] / np.linalg.norm(data[i])
-        return data
+        return data / 10000
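Note: the old loop rescaled each row to unit L2 norm in place, mutating the caller's array and producing NaNs on any all-zero row; the replacement is a stateless constant scale. A quick comparison of the two behaviours:

    import numpy as np

    data = np.array([[3.0, 4.0], [0.0, 0.0]])

    # Old behaviour: per-row unit L2 norm, in place - NaNs on the zero row.
    old = data.copy()
    for i in range(len(old)):
        old[i] = old[i] / np.linalg.norm(old[i])
    print(old)           # [[0.6 0.8] [nan nan]]

    # New behaviour: one global scale, no mutation, trivially invertible.
    print(data / 10000)  # [[3.e-04 4.e-04] [0.e+00 0.e+00]]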