diff --git a/Meliani/FlappyAgent.py b/Meliani/FlappyAgent.py
new file mode 100644
index 0000000..2d5d797
--- /dev/null
+++ b/Meliani/FlappyAgent.py
@@ -0,0 +1,12 @@
+import numpy as np
+from keras.models import Sequential, load_model
+model = load_model("best_model.dqf")
+def FlappyPolicy(state, screen):
+    q = model.predict(np.array(list(state.values())).reshape(1, len(state)))  # Q-values predicted from the hand-crafted state features
+#    q = self.model.predict(screen.reshape(1, screen.shape[0], screen.shape[1], screen.shape[2]))
+#    print(q)
+
+    return np.argmax(q) * 119  # 119 is the PLE key code for "flap"; argmax 0 means "do nothing"
+#    return np.random.randint(0,1)*119
+
+
diff --git a/Meliani/best_model.dqf b/Meliani/best_model.dqf
new file mode 100644
index 0000000..022ac94
Binary files /dev/null and b/Meliani/best_model.dqf differ
diff --git a/Meliani/q_learn_state.py b/Meliani/q_learn_state.py
new file mode 100644
index 0000000..ec4bb27
--- /dev/null
+++ b/Meliani/q_learn_state.py
@@ -0,0 +1,95 @@
+import numpy as np
+from ple.games.flappybird import FlappyBird
+from ple import PLE
+import numpy as np
+from keras import optimizers
+from keras.models import Sequential, load_model
+from keras.layers.core import Dense, Dropout, Activation
+from keras.optimizers import RMSprop, sgd
+from keras.layers.recurrent import LSTM
+import numpy as np
+import random
+import sys
+from sklearn.preprocessing import StandardScaler as scl
+import time
+model = Sequential()
+
+file_path = "test_part_"
+
+
+model.add(Dense(512, init='lecun_uniform', input_shape=(8,)))
+
+model.add(Activation('relu'))
+model.add(Dense(2, init='lecun_uniform'))
+model.add(Activation('linear'))
+model.compile(loss='mse', optimizer=optimizers.Adam(lr=1e-4))
+
+gamma = 0.99  # discount factor
+epsilon = 1  # epsilon-greedy exploration rate
+batchSize = 256  # mini-batch size
+
+jeu = FlappyBird()
+p = PLE(jeu, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)
+p.init()
+
+i = 0
+
+while (True):
+    p.reset_game()
+    state = jeu.getGameState()
+    state = np.array(list(state.values()))
+    while (not jeu.game_over()):
+
+
+
+        qval = model.predict(state.reshape(1, len(state)), batch_size=batchSize)  # Q(s, .) from the network built above (Q-learning)
+        if (random.random() < epsilon):  # exploration / exploitation strategy
+            action = np.random.randint(0, 2)
+        else:  # choose the best action from the Q(s, a) values
+            qval_av_action = [-9999] * 2
+
+            for ac in range(0, 2):
+                qval_av_action[ac] = qval[0][ac]
+            action = (np.argmax(qval_av_action))
+        # Take action, observe new state S'
+        # Observe reward
+        reward = p.act(119 * action)  # 119 = flap, 0 = do nothing
+        if reward == 1:
+            reward = 1  # keep the +1 reward for passing a pipe
+        elif reward == -5:
+            reward = -500  # heavier penalty for dying
+        new_state = jeu.getGameState()
+        new_state = np.array(list(new_state.values()))
+        # chosen new reward values (remapped above)
+
+
+        # Get max_a Q(S', a)
+        newQ = model.predict(new_state.reshape(1, len(state)), batch_size=batchSize)
+        maxQ = np.max(newQ)
+        y = np.zeros((1, 2))
+        y[:] = qval[:]
+        if reward != -500:  # non-terminal state (the death reward was remapped to -500 above)
+            update = (reward + gamma * maxQ)
+        else:
+            update = reward
+        y[0][action] = update
+        print("Game #: %s" % (i,))
+        model.fit(state.reshape(1, len(state)), y, batch_size=batchSize, nb_epoch=2, verbose=0)
+        state = new_state
+
+
+    # update the exploitation / exploration strategy
+    if epsilon > 0.1:
+        epsilon -= (1.0 / 10000)
+
+    # save the model every 1000 games
+    if i == 100:
+        model.save(file_path + "0.dqf")
+    if i % 1000 == 0 and i != 0:
+        model.save(file_path + str(i // 1000) + ".dqf")  # integer division so the checkpoint name has no decimal point
+        time.sleep(60)
+    if i == 100000:
+        break
+
+    i = i + 1
+model.save(file_path + "final.dqf")
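Note on q_learn_state.py: the script fits the network on one transition at a time, so `batchSize = 256` only sets Keras's `batch_size` argument and never produces a real mini-batch. A common refinement is an experience-replay buffer. The sketch below is a hypothetical illustration of that idea, not part of the committed diff; it assumes the same 8-input / 2-output `model`, the discount factor `gamma`, and the Keras 1-style `nb_epoch` argument used in the script, and the helper name `replay_step` is made up for the example.

```python
# Hypothetical experience-replay step (illustration only, not part of this diff).
# Assumes `model` and `gamma` are defined as in q_learn_state.py.
import random
from collections import deque

import numpy as np

replay = deque(maxlen=50000)  # stores (state, action, reward, next_state, terminal) tuples

def replay_step(model, gamma, batch_size=256):
    """Fit the Q-network on a random mini-batch of stored transitions."""
    if len(replay) < batch_size:
        return
    batch = random.sample(replay, batch_size)
    states = np.array([s for s, _, _, _, _ in batch])
    next_states = np.array([s2 for _, _, _, s2, _ in batch])
    q = model.predict(states)            # current Q(s, .) for every sampled state
    q_next = model.predict(next_states)  # Q(s', .) used for the bootstrap target
    for row, (_, a, r, _, terminal) in enumerate(batch):
        # Q-learning target: r for terminal transitions, r + gamma * max_a Q(s', a) otherwise
        q[row, a] = r if terminal else r + gamma * np.max(q_next[row])
    model.fit(states, q, batch_size=batch_size, nb_epoch=1, verbose=0)
```

Inside the inner loop, transitions would be stored with something like `replay.append((state, action, reward, new_state, reward == -500))` before calling `replay_step(model, gamma)`.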
diff --git a/RandomBird/run.py b/Meliani/run.py
similarity index 81%
rename from RandomBird/run.py
rename to Meliani/run.py
index 39b5801..cbaa25d 100644
--- a/RandomBird/run.py
+++ b/Meliani/run.py
@@ -4,7 +4,7 @@ import numpy as np
 
 from FlappyAgent import FlappyPolicy
 
-game = FlappyBird(graphics="fixed") # use "fancy" for full background, random bird color and random pipe color, use "fixed" (default) for black background and constant bird and pipe colors.
+game = FlappyBird() # use "fancy" for full background, random bird color and random pipe color, use "fixed" (default) for black background and constant bird and pipe colors.
 
 p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=False, display_screen=True)
 # Note: if you want to see your agent act in real time, set force_fps to False. But don't use this setting for learning, just for display purposes.
diff --git a/README.md b/README.md
index ce4894f..62fddea 100644
--- a/README.md
+++ b/README.md
@@ -1,49 +1,22 @@
 # RL challenge
 
-Your challenge is to learn to play [Flappy Bird](https://en.wikipedia.org/wiki/Flappy_Bird)!
+My challenge is to learn to play [Flappy Bird](https://en.wikipedia.org/wiki/Flappy_Bird)!
 
 Flappybird is a side-scrolling game where the agent must successfully navigate through gaps between pipes.
 Only two actions in this game: at each time step, either you click and the bird flaps, or you don't click and gravity plays its role.
 
-There are three levels of difficulty in this challenge:
-- Learn an optimal policy with hand-crafted features
-- Learn an optimal policy with raw variables
-- Learn an optimal policy from pixels.
-
-# Your job
-
-Your job is to:
-
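For context, the run.py hunk above only shows the setup lines of that script. An evaluation loop of the kind such a script typically contains might look like the sketch below; this is an assumption for illustration, not the actual contents of Meliani/run.py, and the game count of 20 is arbitrary.

```python
# Hypothetical evaluation loop (a sketch, not the repository's run.py).
import numpy as np
from ple.games.flappybird import FlappyBird
from ple import PLE

from FlappyAgent import FlappyPolicy

game = FlappyBird()
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=False, display_screen=True)
p.init()

nb_games = 20
scores = np.zeros(nb_games)

for g in range(nb_games):
    p.reset_game()
    while not p.game_over():
        state = game.getGameState()           # dict of hand-crafted features
        screen = p.getScreenRGB()             # raw pixels, ignored by this agent
        action = FlappyPolicy(state, screen)  # 0 = do nothing, 119 = flap (PLE key code)
        p.act(action)
    scores[g] = game.getScore()

print("Average score over %d games: %.2f" % (nb_games, scores.mean()))
```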