# Created base game with working minimax algorithm; now working on reinforcement learning.
import random
from collections import deque

import numpy as np
import tensorflow as tf
from tensorflow.python.keras import Sequential, regularizers
from tensorflow.python.keras.layers import Dense
class ReinforcementLearning:
    """Deep Q-learning (DQN) agent with an epsilon-greedy policy and experience replay.

    The agent interacts with `env` (which must provide `reset()` and
    `step(action) -> (reward, next_state, done)`), stores transitions in a
    bounded replay buffer, and trains a small dense network to approximate
    Q-values.
    """

    def __init__(self, action_space, state_space, env):
        """Configure hyperparameters, the replay buffer, and the Q-network.

        Args:
            action_space: number of discrete actions available.
            state_space: length of the flattened state vector fed to the model.
            env: environment object exposing `reset()` and `step(action)`.
        """
        self.action_space = action_space
        self.state_space = state_space
        self.env = env
        self.epsilon = 1            # exploration rate; decays toward epsilon_min
        self.gamma = .95            # discount factor for future rewards
        self.batch_size = 64        # minibatch size sampled from replay memory
        self.epsilon_min = .01
        self.epsilon_decay = .995   # multiplicative decay applied per replay() call
        # NOTE(review): learning_rate is never used — the model is compiled with
        # the string optimizer 'nadam' (default LR). Kept for interface stability.
        self.learning_rate = 0.001
        self.memory = deque(maxlen=100000)   # replay buffer; old transitions fall off
        self.model = self._buildModel()

    def AI(self, episode):
        """Train the agent for `episode` episodes.

        Each episode runs at most 1000 steps; every step is stored via
        remember() and immediately followed by a replay() training pass.

        Args:
            episode: number of episodes to run.

        Returns:
            List of per-episode cumulative scores. (Previously collected into
            a local list and discarded; returning it is backward compatible.)
        """
        scores = []
        max_steps = 1000

        for e in range(episode):
            state = np.reshape(self.env.reset(), (1, self.state_space))
            score = 0
            for _ in range(max_steps):
                action = self.act(state)
                reward, next_state, done = self.env.step(action)
                score += reward
                next_state = np.reshape(next_state, (1, self.state_space))
                self.remember(state, action, reward, next_state, done)
                state = next_state
                self.replay()
                if done:
                    print("episode: {}/{}, score: {}".format(e, episode, score))
                    break
            scores.append(score)
        return scores

    def _buildModel(self):
        """Build and compile the dense Q-network.

        Input width matches `self.state_space` (e.g. 32 board position values);
        output is a single unsquashed Q-value estimate.
        """
        board_model = Sequential()

        # FIX: input width was hard-coded to 32; AI()/replay() always feed
        # vectors of length self.state_space, so derive it from the instance.
        board_model.add(Dense(64, activation='relu', input_dim=self.state_space))

        # L2 regularizers to prevent fitting noisy labels
        board_model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
        board_model.add(Dense(16, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
        board_model.add(Dense(8, activation='relu', kernel_regularizer=regularizers.l2(0.01)))

        # Output isn't squashed, because it might lose information.
        board_model.add(Dense(1, activation='linear', kernel_regularizer=regularizers.l2(0.01)))

        # FIX: was 'binary_crossentropy', which assumes outputs/targets in
        # [0, 1]; Q-value regression with a linear head needs a regression
        # loss, so use mean squared error (standard for DQN).
        board_model.compile(optimizer='nadam', loss='mse')

        return board_model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def replay(self):
        """Sample a minibatch from memory and run one gradient step.

        No-op until the buffer holds at least `batch_size` transitions.
        Also decays epsilon once per training pass.
        """
        if len(self.memory) < self.batch_size:
            return

        minibatch = random.sample(self.memory, self.batch_size)
        states = np.squeeze(np.array([t[0] for t in minibatch]))
        actions = np.array([t[1] for t in minibatch])
        rewards = np.array([t[2] for t in minibatch])
        next_states = np.squeeze(np.array([t[3] for t in minibatch]))
        dones = np.array([t[4] for t in minibatch])

        # Bellman target: r + gamma * max_a' Q(s', a'); terminal states keep r only.
        targets = rewards + self.gamma * (np.amax(self.model.predict_on_batch(next_states), axis=1)) * (1 - dones)
        targets_full = self.model.predict_on_batch(states)

        # Overwrite only the Q-value of the action actually taken in each row.
        ind = np.arange(self.batch_size)
        targets_full[ind, actions] = targets

        self.model.fit(states, targets_full, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def act(self, state):
        """Epsilon-greedy action selection.

        With probability epsilon returns a uniformly random action; otherwise
        returns the argmax of the model's predicted Q-values for `state`.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_space)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])