diff --git a/MEOT/DQN/DQL.py b/MEOT/DQN/DQL.py new file mode 100644 index 0000000..3e229bd --- /dev/null +++ b/MEOT/DQN/DQL.py @@ -0,0 +1,241 @@ +# -*- coding: utf-8 -*- +""" +Created on Fri Feb 16 10:50:17 2018 + +@author: Louis +""" + +import os + +os.environ['SDL_VIDEODRIVER'] = 'dummy' +from ple.games.flappybird import FlappyBird +from ple import PLE +import numpy as np +from FlappyAgent import FlappyPolicy +from testG import test_model_G + +import matplotlib.pyplot as plt +from skimage.color import rgb2gray +from skimage.transform import resize +from skimage.exposure import rescale_intensity + +from keras.models import Sequential, load_model +from keras.layers import Dense, Conv2D, Flatten +import graphviz + + +from collections import deque + +def process_screen(x): + + return (255 * resize(rgb2gray(x)[50:, :410], (84, 84))).astype("uint8") + + +#%% Network Definition +dqn = Sequential() +#1st layer +dqn.add(Conv2D(filters=16, kernel_size=(8,8), strides=4, activation="relu", input_shape=(84,84,4))) +#2nd layer +dqn.add(Conv2D(filters=32, kernel_size=(4,4), strides=2, activation="relu")) +dqn.add(Flatten()) +#3rd layer +dqn.add(Dense(units=256, activation="relu")) +#output layer +dqn.add(Dense(units=2, activation="linear")) + +dqn.compile(optimizer="rmsprop", loss="mean_squared_error") + +#%% Training Fonctions + +def epsilon(step): + ##Linear decay until step 200 000 then constant + if step<200000: + return 1-step*(0.09/200000) + return .01 + +def clip_reward(r): + ## Shaping Reward : -1, 0.1, 1 + if (r==0): + return 0.1 + if (r<0): + return -1 + return r + +def greedy_action(network, x): + Q = network.predict(np.array([x])) + return np.argmax(Q) + + +#%% Memory_buffer +# A class for the replay memory + + +class MemoryBuffer: + "An experience replay buffer using numpy arrays" + def __init__(self, length, screen_shape, action_shape): + self.length = length + self.screen_shape = screen_shape + self.action_shape = action_shape + shape = (length,) + screen_shape + self.screens_x = np.zeros(shape, dtype=np.uint8) # starting states + self.screens_y = np.zeros(shape, dtype=np.uint8) # resulting states + shape = (length,) + action_shape + self.actions = np.zeros(shape, dtype=np.uint8) # actions + self.rewards = np.zeros((length,1), dtype=np.int8) # rewards + self.terminals = np.zeros((length,1), dtype=np.bool) # true if resulting state is terminal + self.terminals[-1] = True + self.index = 0 # points one position past the last inserted element + self.size = 0 # current size of the buffer + + def append(self, screenx, a, r, screeny, d): + self.screens_x[self.index] = screenx + #plt.imshow(screenx) + #plt.show() + #plt.imshow(self.screens_x[self.index]) + #plt.show() + self.actions[self.index] = a + self.rewards[self.index] = r + self.screens_y[self.index] = screeny + self.terminals[self.index] = d + self.index = (self.index+1) % self.length + self.size = np.min([self.size+1,self.length]) + + def stacked_frames_x(self, index): + im_deque = deque(maxlen=4) + pos = index % self.length + for i in range(4): # todo + im = self.screens_x[pos] + im_deque.appendleft(im) + test_pos = (pos-1) % self.length + if self.terminals[test_pos] == False: + pos = test_pos + return np.stack(im_deque, axis=-1) + + def stacked_frames_y(self, index): + im_deque = deque(maxlen=4) + pos = index % self.length + for i in range(4): # todo + im = self.screens_y[pos] + im_deque.appendleft(im) + test_pos = (pos-1) % self.length + if self.terminals[test_pos] == False: + pos = test_pos + return np.stack(im_deque, axis=-1) + + 
def minibatch(self, size): + #return np.random.choice(self.data[:self.size], size=sz, replace=False) + indices = np.random.choice(self.size, size=size, replace=False) + x = np.zeros((size,)+self.screen_shape+(4,)) + y = np.zeros((size,)+self.screen_shape+(4,)) + + for i in range(size): + x[i] = self.stacked_frames_x(indices[i]) + y[i] = self.stacked_frames_y(indices[i]) + return x, self.actions[indices], self.rewards[indices], y, self.terminals[indices] + + +#%% Training Episode +# initialize state and replay memory +game = FlappyBird(graphics="fixed") # use "fancy" for full background, random bird color and random pipe color, use "fixed" (default) for black background and constant bird and pipe colors. +p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=False) +# Note: if you want to see your agent act in real time, set force_fps to False. But don't use this setting for learning, just for display purposes. + +p.init() + +total_steps = 800000 +replay_memory_size = 100000 +intermediate_size = 50000 +interval_test = 25000 +mini_batch_size = 32 +gamma = 0.99 + +average_score = [] +max_score= [] + + +p.reset_game() +screen_x = process_screen(p.getScreenRGB()) +stacked_x = deque([screen_x, screen_x, screen_x, screen_x], maxlen=4) +x = np.stack(stacked_x, axis=-1) +replay_memory = MemoryBuffer(replay_memory_size, (84,84), (1,)) +# initial state for evaluation +evaluation_period = 30 +Xtest = np.array([x]) +nb_epochs = total_steps // evaluation_period +epoch=-1 +scoreQ = np.zeros((nb_epochs)) +scoreMC = np.zeros((nb_epochs)) +list_actions = [0,119] + + +# Deep Q-learning with experience replay +for step in range(total_steps): + + if (step%intermediate_size==0): + dqn.save('TrainG5_'+str(int(step/intermediate_size))+'.h5') + print('Saving model: step = ' + str(step)) + + if (step%interval_test==0): + avg_temp = 0 + max_temp = 0 + print('Eval Period : '+str(step)) + avg_temp, max_temp = test_model_G(evaluation_period, dqn) + average_score.append(avg_temp) + max_score.append(max_temp) + + # evaluation +# if(step%10 == 0): +# epoch = epoch+1 +# # evaluation of initial state +# scoreQ[epoch] = np.mean(dqn.predict(Xtest).max(1)) +# # roll-out evaluation +# scoreMC[epoch] = MCeval(network=dqn, trials=20, length=700, gamma=gamma) + # action selection + + if np.random.rand() < epsilon(step): + if np.random.randint(0,5)==1: + a = 0 + else : + a = 1 + else: + a = greedy_action(dqn, x) + # step + + r=p.act(list_actions[a]) + raw_screen_y = p.getScreenRGB() + + r = clip_reward(r) + d=p.game_over() + + screen_y = process_screen(raw_screen_y) + replay_memory.append(screen_x, a, r, screen_y, d) + + # train once the replay buffer holds at least one minibatch + if step > mini_batch_size: + X,A,R,Y,D = replay_memory.minibatch(mini_batch_size) + QY = dqn.predict(Y) + QYmax = QY.max(1).reshape((mini_batch_size,1)) + update = R + gamma * (1-D) * QYmax + QX = dqn.predict(X) + QX[np.arange(mini_batch_size), A.ravel()] = update.ravel() + dqn.train_on_batch(x=X, y=QX) + + # prepare next transition + if d==True: + # restart episode + p.reset_game() + screen_x = process_screen(p.getScreenRGB()) + stacked_x = deque([screen_x, screen_x, screen_x, screen_x], maxlen=4) + x = np.stack(stacked_x, axis=-1) + else: + + # keep going + screen_x = screen_y + stacked_x.append(screen_x) + x = np.stack(stacked_x, axis=-1) + + +dqn.save('TrainG5_max.h5') + +np.savetxt('average.txt',average_score, delimiter=',') +np.savetxt('max.txt',max_score, delimiter=',') \ No newline at end of file diff --git a/MEOT/DQN/FlappyAgent.py
b/MEOT/DQN/FlappyAgent.py new file mode 100644 index 0000000..f97021a --- /dev/null +++ b/MEOT/DQN/FlappyAgent.py @@ -0,0 +1,26 @@ +import numpy as np + +import matplotlib.pyplot as plt +from skimage.color import rgb2gray +from skimage.transform import resize +from skimage.exposure import rescale_intensity + +from keras.models import Sequential, load_model +from keras.layers import Dense, Conv2D, Flatten +import graphviz + +from collections import deque + +list_actions = [0,119] +dqn = load_model('TrainG4_max.h5') +def process_screen(x): + return (255 * resize(rgb2gray(x)[50:, :410], (84, 84))).astype("uint8") + +def FlappyPolicy(state, screen): + screen_x = process_screen(screen) + stacked_x = deque([screen_x, screen_x, screen_x, screen_x], maxlen=4) + x = np.stack(stacked_x, axis=-1) + action = list_actions[np.argmax(dqn.predict(np.expand_dims(x,axis=0)))] + return action + + diff --git a/MEOT/DQN/TrainG4_max.h5 b/MEOT/DQN/TrainG4_max.h5 new file mode 100644 index 0000000..1848bfb Binary files /dev/null and b/MEOT/DQN/TrainG4_max.h5 differ diff --git a/MEOT/DQN/run.py b/MEOT/DQN/run.py new file mode 100644 index 0000000..79770e1 --- /dev/null +++ b/MEOT/DQN/run.py @@ -0,0 +1,39 @@ +# You're not allowed to change this file +from ple.games.flappybird import FlappyBird +from ple import PLE +import numpy as np +from FlappyAgent import FlappyPolicy + +game = FlappyBird(graphics="fixed") # use "fancy" for full background, random bird color and random pipe color, use "fixed" (default) for black background and constant bird and pipe colors. +p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True) +# Note: if you want to see you agent act in real time, set force_fps to False. But don't use this setting for learning, just for display purposes. + +p.init() +reward = 0.0 + +nb_games = 100 +cumulated = np.zeros((nb_games)) + + +for i in range(nb_games): + p.reset_game() + + while(not p.game_over()): + state = game.getGameState() + screen = p.getScreenRGB() + + + action=FlappyPolicy(state, screen) ### Your job is to define this function. + + + + reward = p.act(action) + print(reward) + cumulated[i] = cumulated[i] + reward + +average_score = np.mean(cumulated) +max_score = np.max(cumulated) + + +#####---------- + diff --git a/MEOT/DQN/testG.py b/MEOT/DQN/testG.py new file mode 100644 index 0000000..9237ae1 --- /dev/null +++ b/MEOT/DQN/testG.py @@ -0,0 +1,60 @@ +# -*- coding: utf-8 -*- +""" +Created on Fri Feb 16 22:13:01 2018 + +@author: Louis + +""" + +# Functions used to test during Gcloud training phase. +import os + +os.environ['SDL_VIDEODRIVER'] = 'dummy' +from ple.games.flappybird import FlappyBird +from ple import PLE +import numpy as np +from FlappyAgent import FlappyPolicy + +import matplotlib.pyplot as plt +from skimage.color import rgb2gray +from skimage.transform import resize +from skimage.exposure import rescale_intensity + +from keras.models import Sequential, load_model +from keras.layers import Dense, Conv2D, Flatten +import graphviz + +from collections import deque + +def process_screen(x): + return (255 * resize(rgb2gray(x)[50:, :410], (84, 84))).astype("uint8") + +def test_model_G(nb_games, model): + game = FlappyBird(graphics="fixed") # use "fancy" for full background, random bird color and random pipe color, use "fixed" (default) for black background and constant bird and pipe colors. 
+ p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=False) + p.init() + reward = 0.0 + + cumulated = np.zeros((nb_games)) + list_actions = [0,119] + + for i in range(nb_games): + p.reset_game() + + while(not p.game_over()): + state = game.getGameState() + + screen_x = process_screen(p.getScreenRGB()) + stacked_x = deque([screen_x, screen_x, screen_x, screen_x], maxlen=4) + x = np.stack(stacked_x, axis=-1) + action = list_actions[np.argmax(model.predict(np.expand_dims(x,axis=0)))] + + reward = p.act(action) + + cumulated[i] = cumulated[i] + reward + + avg_score = np.mean(cumulated) + print('Average : '+ str(avg_score)) + mx_score = np.max(cumulated) + print('Max : '+ str(mx_score)) + return avg_score, mx_score diff --git a/MEOT/DQN/testres.py b/MEOT/DQN/testres.py new file mode 100644 index 0000000..916fa5d --- /dev/null +++ b/MEOT/DQN/testres.py @@ -0,0 +1,62 @@ +# -*- coding: utf-8 -*- +""" +Created on Fri Mar 9 14:48:23 2018 + +@author: Louis + +""" + +#Local test sequence + +from ple.games.flappybird import FlappyBird +from ple import PLE +import numpy as np +from FlappyAgent import FlappyPolicy + +import matplotlib.pyplot as plt +from skimage.color import rgb2gray +from skimage.transform import resize +from skimage.exposure import rescale_intensity + +from keras.models import Sequential, load_model +from keras.layers import Dense, Conv2D, Flatten +import graphviz + +from collections import deque + +def process_screen(x): + return (255 * resize(rgb2gray(x)[50:, :410], (84, 84))).astype("uint8") + + +def greedy_action(network, x): + Q = network.predict(np.array([x])) + return np.argmax(Q) + + + #%% +dqn=load_model('TrainG4_19.h5') +game = FlappyBird(graphics="fixed") # use "fancy" for full background, random bird color and random pipe color, use "fixed" (default) for black background and constant bird and pipe colors. 
+p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True) +p.init() +reward = 0.0 +list_actions=[0,119] +nb_games = 100 +cumulated = np.zeros((nb_games)) + + +for i in range(nb_games): + p.reset_game() + + while(not p.game_over()): + state = game.getGameState() + + screen_x = process_screen(p.getScreenRGB()) + stacked_x = deque([screen_x, screen_x, screen_x, screen_x], maxlen=4) + x = np.stack(stacked_x, axis=-1) + action = list_actions[greedy_action(dqn,x)] + + reward = p.act(action) + cumulated[i] = cumulated[i] + reward + +average_score = np.mean(cumulated) +max_score = np.max(cumulated) \ No newline at end of file diff --git a/MEOT/QL/FlappyAgent.py b/MEOT/QL/FlappyAgent.py new file mode 100644 index 0000000..ac9e6d0 --- /dev/null +++ b/MEOT/QL/FlappyAgent.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- + +""" +Created on Wed Jan 24 14:55:41 2018 + +@author: Louis MEOT +""" +from ple.games.flappybird import FlappyBird +from ple import PLE +import numpy as np +from random import randint +import math +import pickle + +game=FlappyBird() +p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=False, display_screen=True) + +p.init() + +nb_games = 20 +cumulated = np.zeros((nb_games)) +f_myfile = open('Q_function600.pickle', 'rb') +Q_function = pickle.load(f_myfile) # variables come out in the order you put them in +f_myfile.close() + +def FlappyPolicy(state, screen): + a= play_loop(state) + return a + + +# Maillage des états +def observeState(state): + y_to_pipe_bottom = state["player_y"] - state["next_pipe_bottom_y"] + y_cat = 0 + x_cat = 0 + h_max = 412 + h_min = -412 + d_max = 288 + nb_y_cat = 14 + nb_x_cat = 5 + + while(y_to_pipe_bottom - h_min > (h_max - h_min) * y_cat/nb_y_cat): + y_cat += 1 + + while(state["next_pipe_dist_to_player"] > d_max * x_cat/nb_x_cat): + x_cat += 1 + + speed_cat = int((state["player_vel"]+16)/2) + + return (x_cat-1,y_cat-1,speed_cat) + + +def epsilon_greedy(Q, s): + a = np.argmax(Q[s[0]][s[1]][s[2]][:]) # Action optimale avec une proba 1-eps + return a + +def play_loop(state): + ps = observeState(state) + action_ind = epsilon_greedy(Q_function,ps) + if (action_ind==1): + action = 119 + else: + action = None + return action + + \ No newline at end of file diff --git a/MEOT/QL/FlappyTraining.py b/MEOT/QL/FlappyTraining.py new file mode 100644 index 0000000..a9d279c --- /dev/null +++ b/MEOT/QL/FlappyTraining.py @@ -0,0 +1,143 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Jan 24 14:55:41 2018 + +@author: Louis MEOT +""" +import os + +os.environ['SDL_VIDEODRIVER'] = 'dummy' + +from ple.games.flappybird import FlappyBird +from ple import PLE +import numpy as np +from random import randint +import math +import pickle + +def FlappyPolicyDyn(state,screen): + print(state) + if(state["player_y"]>state["next_pipe_bottom_y"]-50): + return 119 + else: + return None + +def FlappyPolicy(state, screen,game,p,epsilon,cummulated,i,count,Q, STATES, nb_states): + a= play_loop(state,Q,game,p,epsilon,cumulated,i,count, STATES, nb_states) + return a + +# Maillage des états +def observeState(state,p): + y_to_pipe_bottom = state["player_y"] - state["next_pipe_bottom_y"] + y_cat = 0 + x_cat = 0 + h_max = 412 + h_min = -412 + d_max = 288 + nb_y_cat = 14 + nb_x_cat = 5 + + while(y_to_pipe_bottom - h_min > (h_max - h_min) * y_cat/nb_y_cat): + y_cat += 1 + + while(state["next_pipe_dist_to_player"] > d_max * x_cat/nb_x_cat): + x_cat += 1 + + speed_cat = int((state["player_vel"]+16)/2) + + return (x_cat-1,y_cat-1,speed_cat) + +def rewardAndUpdateQ(STATES, 
nb_states, Q): + alpha = 0.4 + gamma = 0.9 + + for i in range(nb_states-1): + s = STATES[i] + ns = STATES[i+1] + if (i -
  • fork the project at [https://github.com/SupaeroDataScience/RLchallenge](https://github.com/SupaeroDataScience/RLchallenge) on your own github (yes, you'll need one). -
  • move the only file there ('run.py') under a directory "YourLastName". -
  • create 'FlappyPolicy.py' in order to implement the function `FlappyPolicy(state,screen)` used below. You're free to add as many extra files as you need. However, you're not allowed to change 'run.py'. -
  • add any useful material (comments, text files, analysis, etc.) -
  • make a pull request on the original repository when you're done.
-
-`FlappyPolicy(state,screen)` takes both the game state and the screen as input. It gives you the choice of what you base your policy on:
-
-# Installation
-
-You will need to install a few things to get started.
-First, you will need PyGame.
-
-```
-pip install pygame
-```
-
-And you will need [PLE (PyGame Learning Environment)](https://github.com/ntasfi/PyGame-Learning-Environment).
-```
-git clone https://github.com/ntasfi/PyGame-Learning-Environment.git
-cd PyGame-Learning-Environment/
-pip install -e .
-```
+# Presentation
+
+This repository is the result of a school Reinforcement Learning challenge: training a policy to play the FlappyBird game.
+Two methods have been implemented: a tabular Q-learning agent (QL) and a deep Q-learning agent (DQN).
+
+# Computation constraints
+All computations and training phases were pushed to the Google Cloud platform on a free-trial account (8 vCPUs, 30 GB memory).
+The following line keeps the game from opening any window during training, which would otherwise raise an error since Gcloud virtual machines have no display:
+
+os.environ['SDL_VIDEODRIVER'] = 'dummy'
+
+# QLearning
+Largely inspired by previous lessons and fellow students' results, the main idea of this Q-learning agent is to shrink the **state space** to a coarse discrete grid and to punish crashes with a large **negative** reward. A sketch of the discretization and of the update rule is given at the end of this README.
+
+# DQL
+Deep Q-learning (DQN) is the first solution I tried to implement (I then switched to Q-learning in order to present some results).
+The original frame is cropped just behind Flappy's back and above the floor, resized to an 84x84 grayscale image, then stacked with the 3 previous frames so that the network can extract differential information (velocity). A sketch of this preprocessing and of the training update is given below.
+## Training period
+300 000 steps
+Network saved every 50 000 steps
+Policy evaluated every 25 000 steps
+
+The network has been trained for 300 000 steps so far.
+The evolution of the scores suggests that a longer training run would significantly improve the results.
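+
+# Code sketches
+
+## Screen preprocessing and frame stacking (DQN)
+A minimal sketch of the preprocessing described above: the crop, the 84x84 grayscale resize and the 4-frame stack follow `process_screen` and the `deque` usage in `DQN/DQL.py`. The `stack_frame` helper and its `reset` flag are illustrative names of mine, not functions from the repository.
+
+```python
+import numpy as np
+from collections import deque
+from skimage.color import rgb2gray
+from skimage.transform import resize
+
+def process_screen(screen):
+    # Drop the area behind the bird and the floor, convert to grayscale,
+    # resize to 84x84 and rescale to 8-bit integers.
+    return (255 * resize(rgb2gray(screen)[50:, :410], (84, 84))).astype("uint8")
+
+frames = deque(maxlen=4)  # the 4 most recent processed frames
+
+def stack_frame(screen, reset=False):
+    # Return the (84, 84, 4) tensor fed to the network; at the start of an
+    # episode the first frame is repeated 4 times, as in DQL.py.
+    x = process_screen(screen)
+    if reset or not frames:
+        frames.extend([x, x, x, x])
+    else:
+        frames.append(x)
+    return np.stack(frames, axis=-1)
+```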
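+
+## Q-value update on a minibatch (DQN)
+The training step in `DQN/DQL.py` boils down to the update below. The minibatch variables keep the names used there (`X` stacked frames, `A` actions, `R` clipped rewards, `Y` next stacked frames, `D` terminal flags); wrapping the step in a `dqn_training_step` function is only my packaging, and `dqn` and `replay_memory` are assumed to be built as in that file.
+
+```python
+import numpy as np
+
+GAMMA = 0.99
+MINI_BATCH_SIZE = 32
+
+def dqn_training_step(dqn, replay_memory):
+    # Sample transitions: stacked frames X, actions A, clipped rewards R,
+    # next stacked frames Y and terminal flags D.
+    X, A, R, Y, D = replay_memory.minibatch(MINI_BATCH_SIZE)
+
+    # Bootstrapped targets r + gamma * max_a' Q(s', a'), cut off at terminal states.
+    QY_max = dqn.predict(Y).max(1).reshape((MINI_BATCH_SIZE, 1))
+    targets = R + GAMMA * (1 - D) * QY_max
+
+    # Move only the Q-value of the action actually taken towards its target.
+    QX = dqn.predict(X)
+    QX[np.arange(MINI_BATCH_SIZE), A.ravel()] = targets.ravel()
+    return dqn.train_on_batch(x=X, y=QX)
+```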
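+
+## State discretization and tabular update (Q-learning)
+For the tabular agent, the sketch below combines the state discretization of `observeState` with a standard Q-learning update using the `alpha = 0.4` and `gamma = 0.9` of `QL/FlappyTraining.py`. The closed-form binning approximates the `while` loops of `observeState`, and the number of velocity bins (17) is an assumption; the exact update function is not visible in this diff, so treat this as an illustration rather than the training code itself.
+
+```python
+import numpy as np
+
+ALPHA, GAMMA = 0.4, 0.9        # values used in QL/FlappyTraining.py
+NB_X, NB_Y, NB_V = 5, 14, 17   # pipe-distance, height and velocity bins (NB_V assumed)
+
+# Q[x_cat, y_cat, speed_cat, action]; action 0 = do nothing, 1 = flap (key 119)
+Q = np.zeros((NB_X, NB_Y, NB_V, 2))
+
+def observe_state(state):
+    # Discretize (distance to the next pipe, height relative to its bottom, vertical speed).
+    y_to_pipe_bottom = state["player_y"] - state["next_pipe_bottom_y"]
+    y_cat = int((y_to_pipe_bottom + 412) * NB_Y / 824)
+    x_cat = int(state["next_pipe_dist_to_player"] * NB_X / 288)
+    speed_cat = int((state["player_vel"] + 16) / 2)
+    return (min(x_cat, NB_X - 1), min(y_cat, NB_Y - 1), min(max(speed_cat, 0), NB_V - 1))
+
+def q_update(s, a, r, s_next, done):
+    # One tabular Q-learning step; crashes are punished with a large negative reward upstream.
+    best_next = 0.0 if done else np.max(Q[s_next])
+    Q[s][a] += ALPHA * (r + GAMMA * best_next - Q[s][a])
+```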