diff --git a/__pycache__/config.cpython-37.pyc b/__pycache__/config.cpython-37.pyc
deleted file mode 100644
index e8d1d36..0000000
Binary files a/__pycache__/config.cpython-37.pyc and /dev/null differ
diff --git a/__pycache__/config.cpython-38.pyc b/__pycache__/config.cpython-38.pyc
index 8ec24f7..f73b463 100644
Binary files a/__pycache__/config.cpython-38.pyc and b/__pycache__/config.cpython-38.pyc differ
diff --git a/neural_network.py b/neural_network.py
index 6ccaff7..d420b0f 100644
--- a/neural_network.py
+++ b/neural_network.py
@@ -1,121 +1,138 @@
 import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers
+import keras
 import gym
 from matplotlib import pyplot as plt
 from config import infos
-import pickle
 from collections import deque
 import numpy as np
 import random
-from tqdm import tqdm
+import copy
+import sys
+import argparse
+
 
 env = gym.envs.make("CartPole-v1")
 state_size = env.observation_space.shape[0]
 action_size = env.action_space.n
+output_dir = "./weights"
+memory = deque(maxlen=2000)
 
-class DQNAgent:
-
-    def __init__(self, state_size, action_size):
-        self.state_size = state_size
-        self.action_size = action_size
-        self.memory = deque(maxlen=infos.len_memory)
-        self.epsilon = infos.epsilon
-        self.m1 = self.kreate_model()
-        #self.m2 = self.kreate_model()
-        #self.m2.set_weights(self.m1.get_weights())
-
-
-    def kreate_model(self):
-        learning_rate = infos.learning_rate
-        model = keras.Sequential()
-        model.add(keras.layers.Dense(8, input_shape=[self.state_size], activation='relu'))
-        model.add(keras.layers.Dense(16, activation='relu'))
-        model.add(keras.layers.Dense(32, activation='relu'))
-        model.add(keras.layers.Dense(64, activation='relu'))
-        model.add(keras.layers.Dense(self.action_size, activation='linear'))
-        model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(lr=learning_rate))
-        return model
+def init_model(state_size, action_size):
+    learning_rate = infos.learning_rate
+    model = keras.Sequential()
+    model.add(keras.layers.Dense(8, input_shape=[state_size], activation='relu'))
+    model.add(keras.layers.Dense(16, activation='relu'))
+    model.add(keras.layers.Dense(32, activation='relu'))
+    # model.add(keras.layers.Dense(64, activation='relu'))
+    model.add(keras.layers.Dense(action_size, activation='linear'))
+    model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(lr=learning_rate))
+    return model
 
-    def acting(self, state):
-        if (random.random() < self.epsilon):
-            return random.randrange(self.action_size)
-        action = np.argmax(self.m1.predict(state)[0]) ### change to m2
-        return (action)
-
-    def fitting(self, state, action, reward, new_state, target_qvalues):
-        target = reward + (infos.discount_factor * max(self.m1.predict(new_state)[0])) ### change to m2
-        target_qvalues[0][action] = target
-        self.m1.fit(state, target_qvalues, verbose = 0)
-
-    def evaluate(self):
-        results = []
-        for episode in range(infos.eval_size):
-            state = env.reset()
-            state = np.reshape(state, [1, state_size])
-            steps = 0
-            done = False
-
-            while not done and steps < infos.replay_memory:
-                predicted_qvalues = self.m1.predict(state)
-                action = np.argmax(predicted_qvalues[0])
-                state, _, done, _ = env.step(action)
-                state = np.reshape(state, [1, state_size])
-                steps += 1
-            results.append(steps)
-        return np.mean(results)
-
-    def update_epsilon(self):
-        if self.epsilon > infos.epislon_min:
-            self.epsilon = self.epsilon * infos.epsilon_decay
-        if (self.epsilon <= infos.epislon_min):
-            self.epsilon = infos.epislon_min
+def policy(state, predicted_qvalues, epsilon):
+    if (random.random() < epsilon):
+        action = random.randint(0, 1)
+    else:
+        action = np.argmax(predicted_qvalues)
+    return (action)
 
 def load(name, model):
-    model.load_weights(name)
-
+    model.load_weights(name)
+
 def save(name, model):
-    model.save_weights(name)
+    model.save_weights(name)
+
+def fit_model(state, action, reward, new_state, m1, m2, target_qvalues):
+    target = reward + (infos.discount_factor * max(m2.predict(new_state)[0]))
+    target_qvalues[0][action] = target
+    m1.fit(state, target_qvalues, verbose = 0)
+    return m1, m2
+
+def copy_model(model):
+    model_copy = keras.models.clone_model(model)
+    model_copy.build((None, action_size))
+    model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(lr=infos.learning_rate))
+    model_copy.set_weights(model.get_weights())
+    return (model_copy)
+
+def eval(m1):
+    results = []
+    for episode in range(3):
+        state = env.reset()
+        state = np.reshape(state, [1, state_size])
+        steps = 0
+        done = False
+
+        while not done and steps < 200:
+            predicted_qvalues = m1.predict(state)
+            action = np.argmax(predicted_qvalues[0])
+            state, _, done, _ = env.step(action)
+            state = np.reshape(state, [1, state_size])
+            steps += 1
+        results.append(steps)
+    return np.mean(results)
+
+def play(m1):
+    while True:
+        state = env.reset()
+        state = np.reshape(state, [1, state_size])
+        done = False
+        while not done:
+            action = np.argmax(m1.predict(state)[0]) # action = random.randint(0, 1)
+            new_state, reward, done, _ = env.step(action)
+            env.render()
+            state = new_state
+    env.close()
 
 
 def learn():
-    agent = DQNAgent(state_size, action_size)
-    for episode in tqdm(range(infos.episodes)):
-        state = env.reset()
-        state = np.reshape(state, [1, state_size])
-        steps = 0
-        done = False
-
-        while not done:
-            predicted_qvalues = agent.m1.predict(state) ### change to m2
-            action = agent.acting(state)
-            new_state, reward, done, _ = env.step(action)
-            new_state = np.reshape(new_state, [1, state_size])
-            steps += 1
-            if done == True:
-                reward = infos.reward_values[0]
-            agent.memory.append((state, action, reward, new_state, done))
-            agent.fitting(state, action, reward, new_state, predicted_qvalues)
-            state = new_state
-
-        if len(agent.memory) > infos.replay_memory and (random.random() < 0.5):
-            minibatch = random.sample(agent.memory, infos.replay_memory)
-            for state, action, reward, new_state, done in minibatch:
-                predicted_qvalues = agent.m1.predict(state) ### change to m2
-                action = agent.acting(state)
-                agent.fitting(state, action, reward, new_state, predicted_qvalues)
+    epsilon = infos.epsilon
+    m1 = init_model(state_size, action_size)
+    m2 = init_model(state_size, action_size)
+    m2.set_weights(m1.get_weights())
+    for episode in range(infos.episodes):
+        state = env.reset()
+        state = np.reshape(state, [1, state_size])
+        steps = 0
+        done = False
+
+        while not done:
+            predicted_qvalues = m2.predict(state)
+            action = policy(state, predicted_qvalues[0], epsilon)
+            new_state, reward, done, _ = env.step(action)
+            new_state = np.reshape(new_state, [1, state_size])
+            steps += 1
+            if done == True:
+                reward = infos.reward_values[0]
+            memory.append((state, action, reward, new_state, done))
+            m1, m2 = fit_model(state, action, reward, new_state, m1, m2, predicted_qvalues)
+            state = new_state
+
+        if len(memory) > 200 and (random.random() < 0.5):
+            print(f"*** memory replay for episode:{episode}")
+            minibatch = random.sample(memory, infos.batch_size)
+            ### check minibatch
+            for state, action, reward, new_state, done in minibatch:
+                predicted_qvalues = m1.predict(state)
+                action = policy(state, predicted_qvalues[0], epsilon)
+                m1, m2 = fit_model(state, action, reward, new_state, m1, m2, predicted_qvalues)
 
-        agent.update_epsilon()
-        # print(f'\nepisode = {episode}, total_steps = {steps} and epsilon == {round(epsilon, 3)}')
-        # if episode % 10 == 0 and episode != 0:
-        #     print(f"evaluation m1 = {agent.evaluate()}")
-        #     agent.m1.set_weights(agent.m1.get_weights()) ### change to m2
+        epsilon = epsilon * infos.epsilon_decay
+        if (epsilon < infos.epislon_min):
+            epsilon = infos.epislon_min
+
+        print(f'\nepisode = {episode}, total_steps = {steps} and epsilon == {round(epsilon, 3)}')
+        if episode % 10 == 0 and episode != 0:
+            print(f"evaluation m1 = {eval(m1)}")
+            m2.set_weights(m1.get_weights())
 
-        if episode % 50 == 0 and episode != 0:
-            agent.m1.save_weights(f'weigths/with_dqn_{episode}.hdf5', agent.m1)
-
-    return agent.m1
+        if episode % 50 == 0 and episode != 0:
+            save(f'outs/with_dqn_{episode}.hdf5', m1)
+    return m1
 
 
 if __name__ == "__main__":
-    m1 = learn()
\ No newline at end of file
+    m1 = learn()
+    episode = sys.argv[1]
+    m1 = init_model(state_size, action_size)
+    m1.load_weights("weigths/weights_dqn_550e")
+    play(m1)
\ No newline at end of file
diff --git a/neural_network2.py b/neural_network2.py
deleted file mode 100644
index 274aa8a..0000000
--- a/neural_network2.py
+++ /dev/null
@@ -1,143 +0,0 @@
-# import tensorflow as tf
-# from tensorflow import keras
-# from tensorflow.keras import layers
-
-import gym
-from matplotlib import pyplot as plt
-from config import infos
-import pickle
-from collections import deque
-import numpy as np
-import random
-import copy
-import sys
-
-
-env = gym.envs.make("CartPole-v1")
-state_size = env.observation_space.shape[0]
-action_size = env.action_space.n
-print(f"state_size = {state_size}, action size = {action_size}")
-output_dir = "./cartpole/outs"
-memory = deque(maxlen=2000)
-
-def init_model(state_size, action_size):
-    learning_rate = infos.learning_rate
-    model = keras.Sequential()
-    model.add(keras.layers.Dense(8, input_shape=[state_size], activation='relu'))
-    model.add(keras.layers.Dense(16, activation='relu'))
-    model.add(keras.layers.Dense(32, activation='relu'))
-    #model.add(keras.layers.Dense(64, activation='relu'))
-    model.add(keras.layers.Dense(action_size, activation='linear'))
-    model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(lr=learning_rate))
-    return model
-
-def policy(state, predicted_qvalues, epsilon):
-    if (random.random() < epsilon):
-        action = random.randint(0, 1)
-    else:
-        action = np.argmax(predicted_qvalues)
-    return (action)
-
-def load(name, model):
-    model.load_weights(name)
-
-def save(name, model):
-    model.save_weights(name)
-
-def fit_model(state, action, reward, new_state, m1, m2, target_qvalues):
-    target = reward + (infos.discount_factor * max(m2.predict(new_state)[0]))
-    target_qvalues[0][action] = target
-    #is .fit() good ?
-    # print(f"state = {state}, action = {action}, reward = {reward}, target_qvalues = {target_qvalues}")
-    m1.fit(state, target_qvalues, verbose = 0)
-    return m1, m2
-
-def copy_model(model):
-    model_copy = keras.models.clone_model(model)
-    model_copy.build((None, action_size)) # replace 10 with number of variables in input layer
-    model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(lr=infos.learning_rate))
-    model_copy.set_weights(model.get_weights())
-    return (model_copy)
-
-def eval(m1):
-    results = []
-    for episode in range(3):
-        state = env.reset()
-        state = np.reshape(state, [1, state_size])
-        steps = 0
-        done = False
-
-        while not done and steps < 200:
-            predicted_qvalues = m1.predict(state)
-            action = np.argmax(predicted_qvalues[0])
-            state, _, done, _ = env.step(action)
-            state = np.reshape(state, [1, state_size])
-            steps += 1
-        results.append(steps)
-    return np.mean(results)
-
-def play(m1):
-    while True:
-        state = env.reset()
-        state = np.reshape(state, [1, state_size])
-        done = False
-        while not done:
-            action = random.randint(0, 1)#np.argmax(m1.predict(state)[0])
-            new_state, reward, done, _ = env.step(action)
-            env.render()
-            state = new_state
-    env.close()
-
-
-def learn():
-    epsilon = infos.epsilon
-    m1 = init_model(state_size, action_size)
-    m2 = init_model(state_size, action_size)
-    m2.set_weights(m1.get_weights())
-    for episode in range(infos.episodes):
-        state = env.reset()
-        state = np.reshape(state, [1, state_size])
-        steps = 0
-        done = False
-
-        while not done:
-            predicted_qvalues = m2.predict(state)
-            action = policy(state, predicted_qvalues[0], epsilon)
-            new_state, reward, done, _ = env.step(action)
-            new_state = np.reshape(new_state, [1, state_size])
-            steps += 1
-            if done == True:
-                reward = infos.reward_values[0]
-            memory.append((state, action, reward, new_state, done))
-            m1, m2 = fit_model(state, action, reward, new_state, m1, m2, predicted_qvalues)
-            state = new_state
-
-        if len(memory) > 200 and (random.random() < 0.5):
-            print(f"*** memory replay for episode:{episode}")
-            minibatch = random.sample(memory, infos.batch_size)
-            ### check minibatch
-            for state, action, reward, new_state, done in minibatch:
-                predicted_qvalues = m1.predict(state)
-                action = policy(state, predicted_qvalues[0], epsilon)
-                m1, m2 = fit_model(state, action, reward, new_state, m1, m2, predicted_qvalues)
-
-        epsilon = epsilon * infos.epsilon_decay
-        if (epsilon < infos.epislon_min):
-            epsilon = infos.epislon_min
-
-        print(f'\nepisode = {episode}, total_steps = {steps} and epsilon == {round(epsilon, 3)}')
-        if episode % 10 == 0 and episode != 0:
-            print(f"evaluation m1 = {eval(m1)}")
-            m2.set_weights(m1.get_weights())
-
-        if episode % 50 == 0 and episode != 0:
-            save(f'outs/with_dqn_{episode}.hdf5', m1)
-    return m1
-
-if __name__ == "__main__":
-    #m1 = learn()
-    episode = sys.argv[1]
-    m1 = None
-    # m1 = init_model(state_size, action_size)
-    # m1.load_weights(f'outs/with_dqn_{episode}.hdf5')
-    play(m1)
\ No newline at end of file
diff --git a/q_table_bis.pkl b/q_table_bis.pkl
deleted file mode 100644
index 2f0fe38..0000000
Binary files a/q_table_bis.pkl and /dev/null differ
diff --git a/tabledeq.py b/tabledeq.py
index f08fe66..ddc8c81 100644
--- a/tabledeq.py
+++ b/tabledeq.py
@@ -4,6 +4,16 @@ import numpy as np
 from config import infos
 import pickle
+import sys
+import argparse
+
+def parse_arguments():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-l', '--load', help='Choose if you want to load an existing qtable', action='store_true')
+    parser.add_argument('-e', '--epochs', help='Choose the number of epochs you need for your training', type=int, default=200)
+    args = parser.parse_args()
+    return (args)
+
 
 
 def render_graph(avg_steps, avg_episodes, goal):
     if infos.graph == 1:
@@ -89,7 +99,7 @@ def learn():
         print(f'episode = {episode}, avg_steps = {round(avg_steps[-1], 3)} and epsilon == {round(infos.epsilon, 3)}')
 
     if avg_steps[-1] > 400:
-        with open("q_table_bis.pkl", "wb+") as f:
+        with open(f"q_table_bis.pkl", "wb+") as f:
             pickle.dump(Q_table, f)
         return
 
@@ -115,11 +125,15 @@ def play():
         state = new_state
     env.close()
 
-def read_cutie():
-    with open("q_table.pkl", "rb") as f:
+def read_cutie(file):
+    with open(file, "rb") as f:
         return pickle.load(f)
+
 
 if __name__ == "__main__":
-    # learn()
-    Q_table = read_cutie()
+    args = parse_arguments()
+    if args.load == False:
+        learn()
+    elif args.load == True:
+        Q_table = read_cutie("q_table.pkl")
     play()
\ No newline at end of file
diff --git a/weigths/with_dqn_550.hdf5 b/weigths/weights_dqn_550e
similarity index 100%
rename from weigths/with_dqn_550.hdf5
rename to weigths/weights_dqn_550e
diff --git a/weigths/with_dqn_100.hdf5 b/weigths/with_dqn_100.hdf5
deleted file mode 100644
index c6bd47a..0000000
Binary files a/weigths/with_dqn_100.hdf5 and /dev/null differ
diff --git a/weigths/with_dqn_150.hdf5 b/weigths/with_dqn_150.hdf5
deleted file mode 100644
index 25a2dc1..0000000
Binary files a/weigths/with_dqn_150.hdf5 and /dev/null differ
diff --git a/weigths/with_dqn_200.hdf5 b/weigths/with_dqn_200.hdf5
deleted file mode 100644
index f9fd7eb..0000000
Binary files a/weigths/with_dqn_200.hdf5 and /dev/null differ
diff --git a/weigths/with_dqn_250.hdf5 b/weigths/with_dqn_250.hdf5
deleted file mode 100644
index 63c48be..0000000
Binary files a/weigths/with_dqn_250.hdf5 and /dev/null differ
diff --git a/weigths/with_dqn_300.hdf5 b/weigths/with_dqn_300.hdf5
deleted file mode 100644
index dc5edf3..0000000
Binary files a/weigths/with_dqn_300.hdf5 and /dev/null differ
diff --git a/weigths/with_dqn_350.hdf5 b/weigths/with_dqn_350.hdf5
deleted file mode 100644
index b77b2de..0000000
Binary files a/weigths/with_dqn_350.hdf5 and /dev/null differ
diff --git a/weigths/with_dqn_400.hdf5 b/weigths/with_dqn_400.hdf5
deleted file mode 100644
index 0b6cf16..0000000
Binary files a/weigths/with_dqn_400.hdf5 and /dev/null differ
diff --git a/weigths/with_dqn_450.hdf5 b/weigths/with_dqn_450.hdf5
deleted file mode 100644
index e005587..0000000
Binary files a/weigths/with_dqn_450.hdf5 and /dev/null differ
diff --git a/weigths/with_dqn_50.hdf5 b/weigths/with_dqn_50.hdf5
deleted file mode 100644
index b97b8ef..0000000
Binary files a/weigths/with_dqn_50.hdf5 and /dev/null differ
diff --git a/weigths/with_dqn_500.hdf5 b/weigths/with_dqn_500.hdf5
deleted file mode 100644
index a2d4af4..0000000
Binary files a/weigths/with_dqn_500.hdf5 and /dev/null differ