diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e43b0f9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.DS_Store
diff --git a/Learner.py b/Learner.py
index 063346b..3fdf473 100644
--- a/Learner.py
+++ b/Learner.py
@@ -1,17 +1,25 @@
-__author__ = 'philippe'
+__author__ = 'vivek'
+
 import World
 import threading
 import time
 
-discount = 0.3
-actions = World.actions
-states = []
-Q = {}
+import random
+
+# Initial Values
+discount = 0.3 # discount
+actions = World.actions # actions
+states = [] # states
+Q = {} # policies / rewards
+
+# Define World
 for i in range(World.x):
+    """ Loads all available states (squares) in the World"""
     for j in range(World.y):
         states.append((i, j))
 
 for state in states:
+    """ For every state, set the default reward for each action to 0.1"""
     temp = {}
     for action in actions:
         temp[action] = 0.1
@@ -19,75 +27,168 @@
     Q[state] = temp
 
 for (i, j, c, w) in World.specials:
+    """ For x, y, color, score in each World.specials list, update the Q reward array so reaching this square through any action earns the new reward"""
     for action in actions:
         Q[(i, j)][action] = w
         World.set_cell_score((i, j), action, w)
 
+# Value Functions
+def max_Q(s):
+    """Choose the action to take from state `s` using the Q table.
+
+    The action with the highest Q value is selected; when several actions
+    share that value, one of them is picked at random. If even the best
+    Q value is below 0.1 (the default every action starts with), a
+    uniformly random action is returned instead. Q itself is only read,
+    never modified.
+
+    input:
+    s: current position, used as a key into Q
+
+    output:
+    best_a: the chosen action
+    best_q: the Q value currently stored for that action"""
+
+    # list() so the pairs can be indexed even where items() is a view
+    candidates = list(Q[s].items()) # every (action, q) pair for this state
+
+    # Highest Q value reachable from this state
+    best_q = max(q for _, q in candidates)
+
+    # All actions sharing the top value are equally good; break ties randomly
+    options = [pair for pair in candidates if pair[1] == best_q]
+    best_a, best_q = random.choice(options)
+
+    if best_q < 0.1:
+        # Everything known here is worse than the default reward, so
+        # explore: take any action, and report that action's own Q value
+        best_a, best_q = random.choice(candidates)
+
+    return best_a, best_q
+
 
+
+def inc_Q(s, a, alpha, inc):
+    """ Blend a new estimate into Q[s][a] and pass the result on to the World display.
+
+    The stored value becomes (1 - alpha) * old value + alpha * inc, so alpha = 1
+    replaces the old value outright and alpha = 0 leaves it unchanged.
+
+    s: position the action was taken from
+    a: action that was taken
+    alpha: learning rate (weight given to the new estimate)
+    inc: the new estimate; run() passes reward + discount * a Q value from max_Q
+    """
+
+    blended = (1 - alpha) * Q[s][a] + alpha * inc # weighted average of old value and new estimate
+    Q[s][a] = blended
+
+    World.set_cell_score(s, a, blended) # keep the World's per-cell score display in step with Q[s][a]
+
+# Moving Functions
 def do_action(action):
+    """
+    Perform `action` in the World and report what happened.
+
+    Returns (s, action, r, s2): s = player position before the move,
+    r = change in World.score caused by the move, s2 = position after it.
+    Returns None when `action` is not one of the four entries in `actions`.
+    """
     s = World.player
     r = -World.score
+
     if action == actions[0]:
+        # up (actions[0]); same move as World.call_up
         World.try_move(0, -1)
-    elif action == actions[1]:
-        World.try_move(0, 1)
+
     elif action == actions[2]:
+        # left
         World.try_move(-1, 0)
+
+    elif action == actions[1]:
+        # down (actions[1])
+        World.try_move(0, 1)
+
     elif action == actions[3]:
+        # right
         World.try_move(1, 0)
     else:
         return
+
     s2 = World.player
     r += World.score
     return s, action, r, s2
 
 
-def max_Q(s):
-    val = None
-    act = None
-    for a, q in Q[s].items():
-        if val is None or (q > val):
-            val = q
-            act = a
-    return act, val
-
-
-def inc_Q(s, a, alpha, inc):
-    Q[s][a] *= 1 - alpha
-    Q[s][a] += alpha * inc
-    World.set_cell_score(s, a, Q[s][a])
-
-
+# Start Game
 def run():
+    """Agent loop: pick a move with max_Q, play it, update Q for it, and restart a life when it ends or stalls."""
     global discount
     time.sleep(1)
+
     alpha = 1
+    beta = None
+
     t = 1
+
+    stuck = 0
+    old_s = (0,0)
+
     while True:
-        # Pick the right action
-        s = World.player
-        max_act, max_val = max_Q(s)
-        (s, a, r, s2) = do_action(max_act)
+        s = World.player # Starting Position
+
+        if s == old_s: # Stuck Check: same starting position as on the previous pass
+            stuck += 1
+            #print('Been in the same spot this many times:{}'.format(stuck))
+
+        # Before Move
+        # Current Position
+        #print('Current Position: {},\n Potential Actions: {}\n\n'.format(s, Q[s]))
+
+        # Potential Actions
+        max_act, max_val = max_Q(s) # action to take from position s, and its Q value
+        #print('Suggested Action: {},\n Q Value: {}\n\n'.format(max_act, max_val))
+
+        # Making Move
+        (s, a, r, s2) = do_action(max_act) # Return the results of an action from old s to new s2
+        #print('New Position: {},\n Current Score: {},\n Actual Move: {},\n Old Position {}\n\n'.format(s2, r, a, s2))
+
+        # Learning Consequences
+        # alpha: learning rate; the weight inc_Q gives to the new estimate. It shrinks as t grows, so later moves in a life change Q less.
+        # beta: the new estimate for Q[s][a]: the reward of the move plus the discounted Q value max_Q reports for the position reached (s2)
+
+        alpha = pow(t, -0.25)
+
+        # beta needs the Q value at the new position, so look that up before computing it
 
-        # Update Q
-        max_act, max_val = max_Q(s2)
-        inc_Q(s, a, alpha, r + discount * max_val)
+        # Check Values at New Position
+        max_act, max_val = max_Q(s2) # action / Q value max_Q reports for s2
+        #print('Suggested Action at New Position: {},\nQ Value at New Position: {}\n'.format(max_act, max_val)) # seems to be the same position. Hmm....
+        beta = r + discount * max_val
+        # Update Q Matrix
+        inc_Q(s, a, alpha, beta)
+        #print('Updating Q value at Position: {},\n for Action: {}, \n with Alpha {},\n and Beta {}\n'.format(s, a, alpha, beta, max_val))
 
-        # Check if the game has restarted
         t += 1.0
-        if World.has_restarted():
+        old_s = s
+
+        print('Moves {} | Score {}'.format(t, round(World.score, 2)))
+
+        if World.has_restarted() or (t > 100) or (stuck > 10): # restart after a finished game, 100 moves in one life, or a stuck count above 10
             World.restart_game()
             time.sleep(0.01)
+
             t = 1.0
+            stuck = 0
 
-        # Update the learning rate
-        alpha = pow(t, -0.1)
 
         # MODIFY THIS SLEEP IF THE GAME IS GOING TOO FAST.
-        time.sleep(0.1)
+        time.sleep(0.01) # seconds between moves
+        # sanic
 
 
 t = threading.Thread(target=run)
 t.daemon = True
 t.start()
+
 World.start_game()
diff --git a/README.md b/README.md
index 68241c8..a2cbbec 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,11 @@
-# q_learning_demo
+# Q Learning Demo
+
 This is the code for "How to use Q Learning in Video Games Easily" by Siraj Raval on Youtube
 
 ##Overview
 
 This is the associated code for [this](https://youtu.be/A5eihauRQvo) video on Youtube by Siraj Raval. This is a simple example of a type of [reinforcement learning](https://en.wikipedia.org/wiki/Reinforcement_learning)
-called [Q learning](https://en.wikipedia.org/wiki/Q-learning). 
+called [Q learning](https://en.wikipedia.org/wiki/Q-learning).
 
 	● Rules: The agent (yellow box) has to reach one of the goals to end the game (green or red cell).
 	● Rewards: Each step gives a negative reward of -0.04. The red cell gives a negative reward of -1. The green one gives a positive reward of +1.
@@ -16,24 +17,30 @@ called [Q learning](https://en.wikipedia.org/wiki/Q-learning).
 -Python 2.7
 -tkinter
 
-If on Ubuntu you can install tkinter for python2.7 with
-$sudo apt-get install python-tk
-
 ##Usage
 
-Run `python Learner.py` in terminal to see the the bot in action. It'll find the optimal strategy pretty fast (like in 15 seconds)
+Run `python Learner.py` in terminal to see the bot in action.
 
 ##Challenge
 
-The challenge for this video is to 
+The challenge for this video is to
 
-* modify the the game world so that it's bigger 
+* modify the game world so that it's bigger
 * add more obstacles
 * have the bot start in a different position
 
 **Bonus points if you modify the bot in some way that makes it more efficient**
 
-#Due Date is Thursday at noon PST January 12th 2017
+##Solution
+
+My solution features the following:
+
+* `random_start()` start anywhere on the board.
+* `difficulty` parameter that scales up the number of walls.
+* `create_reds()`,`create_greens()` and `create_walls()` that add more special squares to help and hinder the agent.
+* `(x,y)` scaling for larger boards
+* `max_Q()` changes for more randomized/Q-sensitive agent decision making.
+* Tons more documentation (for my own learning)
 
 ##Credits
 
diff --git a/World.py b/World.py
index dee8cc9..b5ff9eb 100644
--- a/World.py
+++ b/World.py
@@ -1,25 +1,79 @@
-__author__ = 'philippe'
+__author__ = 'vivek'
+
 from Tkinter import *
+import random
+
 master = Tk()
 
+# Aesthetics
 triangle_size = 0.1
+Width = 10 # pixel width of one cell, made smaller to fit
+(x, y) = (50, 50) # board dimensions, scale up or down as necessary
+board = Canvas(master, width=x*Width, height=y*Width) # make the board
+
+# Difficulty
+difficulty = 2 # Higher this is, the harder the maze
+walls_number = int(x * difficulty) # How many walls to generate?
+
+# Rewards
 cell_score_min = -0.2
 cell_score_max = 0.2
-Width = 100
-(x, y) = (5, 5)
+walk_reward = -0.04
+
+# Actions
 actions = ["up", "down", "left", "right"]
 
-board = Canvas(master, width=x*Width, height=y*Width)
-player = (0, y-1)
+# Initial conditions
+def random_start():
+    """Return a random (column, row) start that keeps a two-cell margin from every board edge."""
+    return (random.randrange(2, x-2), random.randrange(2, y-2)) # column bounded by x, row bounded by y
+
+player = random_start()
 score = 1
 restart = False
+
+# Rewards
+cell_score_min = -0.2
+cell_score_max = 0.2
 walk_reward = -0.04
 
-walls = [(1, 1), (1, 2), (2, 1), (2, 2)]
-specials = [(4, 1, "red", -1), (4, 0, "green", 1)]
+# Actions
+actions = ["up", "down", "left", "right"]
+
+# Square Functions
+def create_walls(walls, x=x, y=y):
+    """Return `walls` random wall cells; columns come from 4..x-1 and rows from 4..y-1, so repeats are possible."""
+    origin = (0, 0)
+    wall_list = [(random.randrange(4, x), random.randrange(4, y)) for _ in range(walls)]
+
+    if origin in wall_list: # keep the origin free (cannot trigger while coordinates start at 4)
+        wall_list.remove(origin)
+    return wall_list
+
+def create_reds(x=x, y=y):
+    """ Line the board edges with red -1 cells, leaving (0,0), (1,0) and (0,1) free"""
+    right = [(x-1, row, "red", -1) for row in range(0,y)] # whole last column
+    left = [(0, row, "red", -1) for row in range(2,y)] # column 0, skipping rows 0-1
+    top = [(col, 0, "red", -1) for col in range(2,x)] # row 0, skipping columns 0-1
+    bottom = [(col, y-1, "red", -1) for col in range(0,x)] # whole last row
+
+    # Order: right, left, top, bottom. Corner cells shared by two edges appear twice.
+    reds = right + left + top + bottom
+    return reds
+
+def create_greens():
+    """ Return the three green +1 cells clustered at the (0,0) corner"""
+    corner_cells = [(0, 0), (1, 0), (0, 1)]
+
+    return [(col, row, "green", 1) for (col, row) in corner_cells]
+
+# Special Squares
+specials = [] + create_greens() + create_reds() # x, y, color, score
+walls = create_walls(walls_number) # How many random walls?
 cell_scores = {}
 
 
+# Board Design
 def create_triangle(i, j, action):
     if action == actions[0]:
         return board.create_polygon((i+0.5-triangle_size)*Width, (j+triangle_size)*Width,
@@ -60,6 +114,7 @@ def render_grid():
 render_grid()
 
 
+# Scoring
 def set_cell_score(state, action, val):
     global cell_score_min, cell_score_max
     triangle = cell_scores[state][action]
@@ -73,14 +128,18 @@ def set_cell_score(state, action, val):
     color = "#" + red + green + "00"
     board.itemconfigure(triangle, fill=color)
 
-
+# Moving
 def try_move(dx, dy):
     global player, x, y, score, walk_reward, me, restart
+
     if restart == True:
         restart_game()
+
     new_x = player[0] + dx
     new_y = player[1] + dy
+
     score += walk_reward
+
     if (new_x >= 0) and (new_x < x) and (new_y >= 0) and (new_y < y) and not ((new_x, new_y) in walls):
         board.coords(me, new_x*Width+Width*2/10, new_y*Width+Width*2/10, new_x*Width+Width*8/10, new_y*Width+Width*8/10)
         player = (new_x, new_y)
@@ -89,14 +148,14 @@ def try_move(dx, dy):
             score -= walk_reward
             score += w
             if score > 0:
-                print "Success! score: ", score
+                print "\nSuccess! | Score: ", score
             else:
-                print "Fail! score: ", score
+                print "\n Fail! | Score: ", score
             restart = True
             return
     #print "score: ", score
 
-
+# Moving
 def call_up(event):
     try_move(0, -1)
 
@@ -112,17 +171,19 @@ def call_left(event):
 def call_right(event):
     try_move(1, 0)
 
-
+# Restarting
 def restart_game():
     global player, score, me, restart
-    player = (0, y-1)
+    player = random_start()
     score = 1
     restart = False
     board.coords(me, player[0]*Width+Width*2/10, player[1]*Width+Width*2/10, player[0]*Width+Width*8/10, player[1]*Width+Width*8/10)
 
+
 def has_restarted():
     return restart
 
+# Binding
 master.bind("<Up>", call_up)
 master.bind("<Down>", call_down)
 master.bind("<Right>", call_right)
@@ -133,6 +194,6 @@ def has_restarted():
 
 board.grid(row=0, column=0)
 
-
+# Starting
 def start_game():
     master.mainloop()
diff --git a/World.pyc b/World.pyc
new file mode 100644
index 0000000..9b8e40f
Binary files /dev/null and b/World.pyc differ