diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e43b0f9 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.DS_Store diff --git a/Learner.py b/Learner.py index 063346b..3fdf473 100644 --- a/Learner.py +++ b/Learner.py @@ -1,17 +1,25 @@ -__author__ = 'philippe' +__author__ = 'vivek' + import World import threading import time -discount = 0.3 -actions = World.actions -states = [] -Q = {} +import random + +# Initial Values +discount = 0.3 # discount +actions = World.actions # actions +states = [] # states +Q = {} # policies / rewards + +# Define World for i in range(World.x): + """ Loads all available states (squares) in the World""" for j in range(World.y): states.append((i, j)) for state in states: + """ For every state, set the default reward for each action to 0.1""" temp = {} for action in actions: temp[action] = 0.1 @@ -19,75 +27,168 @@ Q[state] = temp for (i, j, c, w) in World.specials: + """ For x, y, color, score in each World.specials list, update the Q reward array so reaching this square through any action earns the new reward""" for action in actions: Q[(i, j)][action] = w World.set_cell_score((i, j), action, w) +# Value Functions +def max_Q(s): + """ For current position, check which of the next available squares provide the highest Q value. If more than one, choose randomly. + + input: + s: current position + epsilon: value of random actions + + output: + best_a : best action + best_q: best q value for that action""" + + best_q = None + best_a = None + + for a, q in Q[s].items(): # for every action and q value + if best_q is None or (q > best_q): # if val is 0 or current q is higher, assign current a, q + #print('Checking for an Action...') + + best_q = q + best_a = a + + options = [x for x in Q[s].items() if x[1] == best_q] # How many options do we have? 
+ + if len(options) > 1: + # If more than option, pick the best one + best_a, best_q = options[random.randrange(0,len(options))] + + if best_q < 0.1: + # If all the options are bad (less than the regular reward) do something random! + best_a, best_q = Q[s].items()[random.randrange(0,len(Q[s].items()))] + + return best_a, best_q + + +def inc_Q(s, a, alpha, inc): + """ Given the position, action, the learning rate, and inc, set the new cell score. + + We use the alpha to decrease the value of moves over time; the longer a new policy takes, the less its Q value becomes in the q matrix. + + s: current position + a: current action + alpha: current learning rate + inc: r + discount * max_val (World Score + Discount * MaxQ(s).val) + """ + + # For the specific action + Q[s][a] *= 1 - alpha # multiply the Q value for an action by the learning rate + Q[s][a] += alpha * inc # add the incoming value of the alpha * action + + World.set_cell_score(s, a, Q[s][a]) # Set the score of getting to the current position using the action to the new Q[s][a] + +# Moving Functions def do_action(action): + """ + Given an action, make an actual move in the real world. 
+ + s = current player position + r = the current score increased by the cost/reward of the next action + s2 = updated player position + """ s = World.player r = -World.score + if action == actions[0]: + # down World.try_move(0, -1) - elif action == actions[1]: - World.try_move(0, 1) + elif action == actions[2]: + # left World.try_move(-1, 0) + + elif action == actions[1]: + # up + World.try_move(0, 1) + elif action == actions[3]: + # right World.try_move(1, 0) else: return + s2 = World.player r += World.score return s, action, r, s2 -def max_Q(s): - val = None - act = None - for a, q in Q[s].items(): - if val is None or (q > val): - val = q - act = a - return act, val - - -def inc_Q(s, a, alpha, inc): - Q[s][a] *= 1 - alpha - Q[s][a] += alpha * inc - World.set_cell_score(s, a, Q[s][a]) - - +# Start Game def run(): global discount time.sleep(1) + alpha = 1 + beta = None + t = 1 + + stuck = 0 + old_s = (0,0) + while True: - # Pick the right action - s = World.player - max_act, max_val = max_Q(s) - (s, a, r, s2) = do_action(max_act) + s = World.player # Starting Position + + if s == old_s: # Stuck Check + stuck += 1 + #print('Been in the same spot this many times:{}'.format(stuck)) + + # Before Move + # Current Position + #print('Current Position: {},\n Potential Actions: {}\n\n'.format(s, Q[s])) + + # Potential Actions + max_act, max_val = max_Q(s) # find the next highest Q from position s + #print('Suggested Action: {},\n Q Value: {}\n\n'.format(max_act, max_val)) + + # Making Move + (s, a, r, s2) = do_action(max_act) # Return the results of an action from old s to new s2 + #print('New Position: {},\n Current Score: {},\n Actual Move: {},\n Old Position {}\n\n'.format(s2, r, a, s2)) + + # Learning Consequences + # alpha: learning rate; cost of taking too many moves, if the alpha grows too quickly, bad moves seem less bad over time. 
(Really is a 'learning rate'; measures consequence of actions) + # beta: summary of the reward function, which takes the world score, discount, and best q to update the value of that action + + epsilon = pow(t, -0.005) + alpha = pow(t, -0.25) + beta = r + discount * max_val - # Update Q - max_act, max_val = max_Q(s2) - inc_Q(s, a, alpha, r + discount * max_val) + # Check Values at New Position + max_act, max_val = max_Q(s2) # return the score at s2 + #print('Suggested Action at New Position: {},\nQ Value at New Position: {}\n'.format(max_act, max_val)) # seems to be the same position. Hmm.... + + # Update Q Matrix + inc_Q(s, a, alpha, beta) + #print('Updating Q value at Position: {},\n for Action: {}, \n with Alpha {},\n and Beta {}\n'.format(s, a, alpha, beta, max_val)) - # Check if the game has restarted t += 1.0 - if World.has_restarted(): + old_s = s + + print('Moves {} | Score {}'.format(t, round(World.score, 2))) + + + if World.has_restarted() or (t > 100) or (stuck > 10): # 500 tries per life, or you get stuck more than 10 times World.restart_game() time.sleep(0.01) + t = 1.0 + stuck = 0 - # Update the learning rate - alpha = pow(t, -0.1) # MODIFY THIS SLEEP IF THE GAME IS GOING TOO FAST. - time.sleep(0.1) + time.sleep(0.01) # seconds between moves + # sanic t = threading.Thread(target=run) t.daemon = True t.start() + World.start_game() diff --git a/README.md b/README.md index 68241c8..a2cbbec 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,11 @@ -# q_learning_demo +# Q Learning Demo + This is the code for "How to use Q Learning in Video Games Easily" by Siraj Raval on Youtube ##Overview This is the associated code for [this](https://youtu.be/A5eihauRQvo) video on Youtube by Siraj Raval. This is a simple example of a type of [reinforcement learning](https://en.wikipedia.org/wiki/Reinforcement_learning) -called [Q learning](https://en.wikipedia.org/wiki/Q-learning). +called [Q learning](https://en.wikipedia.org/wiki/Q-learning). 
● Rules: The agent (yellow box) has to reach one of the goals to end the game (green or red cell). ● Rewards: Each step gives a negative reward of -0.04. The red cell gives a negative reward of -1. The green one gives a positive reward of +1. @@ -16,24 +17,30 @@ called [Q learning](https://en.wikipedia.org/wiki/Q-learning). -Python 2.7 -tkinter -If on Ubuntu you can install tkinter for python2.7 with -$sudo apt-get install python-tk - ##Usage -Run `python Learner.py` in terminal to see the the bot in action. It'll find the optimal strategy pretty fast (like in 15 seconds) +Run `python Learner.py` in terminal to see the bot in action. ##Challenge -The challenge for this video is to +The challenge for this video is to -* modify the the game world so that it's bigger +* modify the game world so that it's bigger * add more obstacles * have the bot start in a different position **Bonus points if you modify the bot in some way that makes it more efficient** -#Due Date is Thursday at noon PST January 12th 2017 +##Solution + +My solution features the following: + +* `random_start()` to start anywhere on the board. +* `difficulty` parameter that scales up the number of walls. +* `create_reds()`, `create_greens()` and `create_walls()` that add more special squares to help and hinder the agent. +* `(x,y)` scaling for larger boards +* `max_Q()` changes for more randomized/Q-sensitive agent decision making. 
+* Tons more documentation (for my own learning) ##Credits diff --git a/World.py b/World.py index dee8cc9..b5ff9eb 100644 --- a/World.py +++ b/World.py @@ -1,25 +1,79 @@ -__author__ = 'philippe' +__author__ = 'vivek' + from Tkinter import * +import random + master = Tk() +# Aesthetics triangle_size = 0.1 +Width = 10 # pix el width, made smaller to fit +(x, y) = (50, 50) # board dimensions, scale up or down as necessary +board = Canvas(master, width=x*Width, height=y*Width) # make the board + +# Difficulty +difficulty = 2 # Higher this is, the harder the maze +walls_number = int(x * difficulty) # How many walls to generate? + +# Rewards cell_score_min = -0.2 cell_score_max = 0.2 -Width = 100 -(x, y) = (5, 5) +walk_reward = -0.04 + +# Actions actions = ["up", "down", "left", "right"] -board = Canvas(master, width=x*Width, height=y*Width) -player = (0, y-1) +# Initial conditions +def random_start(): + """Start anywhere!""" + return(random.randrange(2,y-2), random.randrange(2,y-2)) # starting point + +player = random_start() score = 1 restart = False + +# Rewards +cell_score_min = -0.2 +cell_score_max = 0.2 walk_reward = -0.04 -walls = [(1, 1), (1, 2), (2, 1), (2, 2)] -specials = [(4, 1, "red", -1), (4, 0, "green", 1)] +# Actions +actions = ["up", "down", "left", "right"] + +# Square Functions +def create_walls(walls, x=x, y=y): + """Let's make some walls!""" + wall_list = [(random.randrange(4, x), random.randrange(4, y)) for i in range(0,walls)] + + if (0,0) in wall_list: # remove origin + wall_list.remove((0,0)) + + return(wall_list) + +def create_reds(x=x, y=y): + """ Lets make every other sides completely wrong to help it out""" + wall_list = [] + + wall_list += [(x-1, i, "red", -1) for i in range(0,y)] # right + wall_list += [(0, i, "red", -1) for i in range(2,y)] # left + wall_list += [(i, 0, "red", -1) for i in range(2,x)] # top + wall_list += [(i, y-1, "red", -1) for i in range(0,x)] # bottom + + return(wall_list) + +def create_greens(): + """ Lets make a 
little green corner for mercy""" + greens = [(0, 0, "green", 1), (1, 0, "green", 1), (0, 1, "green", 1)] + + return(greens) + +# Special Squares +specials = [] + create_greens() + create_reds() # x, y, color, score +walls = create_walls(walls_number) # How many random walls? cell_scores = {} +# Board Design def create_triangle(i, j, action): if action == actions[0]: return board.create_polygon((i+0.5-triangle_size)*Width, (j+triangle_size)*Width, @@ -60,6 +114,7 @@ def render_grid(): render_grid() +# Scoring def set_cell_score(state, action, val): global cell_score_min, cell_score_max triangle = cell_scores[state][action] @@ -73,14 +128,18 @@ def set_cell_score(state, action, val): color = "#" + red + green + "00" board.itemconfigure(triangle, fill=color) - +# Moving def try_move(dx, dy): global player, x, y, score, walk_reward, me, restart + if restart == True: restart_game() + new_x = player[0] + dx new_y = player[1] + dy + score += walk_reward + if (new_x >= 0) and (new_x < x) and (new_y >= 0) and (new_y < y) and not ((new_x, new_y) in walls): board.coords(me, new_x*Width+Width*2/10, new_y*Width+Width*2/10, new_x*Width+Width*8/10, new_y*Width+Width*8/10) player = (new_x, new_y) @@ -89,14 +148,14 @@ def try_move(dx, dy): score -= walk_reward score += w if score > 0: - print "Success! score: ", score + print "\nSuccess! | Score: ", score else: - print "Fail! score: ", score + print "\n Fail! 
| Score: ", score restart = True return #print "score: ", score - +# Moving def call_up(event): try_move(0, -1) @@ -112,17 +171,19 @@ def call_left(event): def call_right(event): try_move(1, 0) - +# Restarting def restart_game(): global player, score, me, restart - player = (0, y-1) + player = random_start() score = 1 restart = False board.coords(me, player[0]*Width+Width*2/10, player[1]*Width+Width*2/10, player[0]*Width+Width*8/10, player[1]*Width+Width*8/10) + def has_restarted(): return restart +# Binding master.bind("", call_up) master.bind("", call_down) master.bind("", call_right) @@ -133,6 +194,6 @@ def has_restarted(): board.grid(row=0, column=0) - +# Starting def start_game(): master.mainloop() diff --git a/World.pyc b/World.pyc new file mode 100644 index 0000000..9b8e40f Binary files /dev/null and b/World.pyc differ