
Commit 4cd279f

Fixed softmax policy errors

Fixed policy function in qlearn.py and sarsa.py. Also updated the descriptions in README.md.

1 parent dc6156b commit 4cd279f
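
For reference, the corrected action selection in both qlearn.py and sarsa.py follows the pattern below. This is a minimal standalone sketch: the name softmax_policy and the n_actions parameter are introduced here for illustration, while the repository code reads the action count from env.action_space.n and the values from its q/Q table.

import random
import numpy as np

def softmax_policy(q, state, n_actions, t):
    # temperature-scaled value of each action in this state
    p = np.array([q[(state, a)] / t for a in range(n_actions)])
    # softmax (Boltzmann) distribution over actions
    prob_actions = np.exp(p) / np.sum(np.exp(p))
    # sample an action index by walking the cumulative distribution
    choice = random.uniform(0, 1)
    cumulative_probability = 0.0
    for a, pr in enumerate(prob_actions):
        cumulative_probability += pr
        if cumulative_probability > choice:
            return a
    return n_actions - 1  # fallback in case of floating-point round-off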

File tree

3 files changed: +25 -25 lines changed

README.md (+4 -2)

@@ -2,13 +2,15 @@
 Implementation of Reinforcement Learning Algorithms, mainly in openAI gym

 ### Algorithms Implemented so far:
-* Q-learning with Boltzmann Action Selection: qlearn.py (still working on parameters)
+* Q-learning with Softmax Action Selection: qlearn.py
 * Q-learning with e-greedy Action Selection: qlearn2.py
+* SARSA with Softmax Action Selection: sarsa.py


 ### Evaluations:
 * qlearn2.py: https://gym.openai.com/evaluations/eval_S6ldgxysSay2eidRk9eHNA
+* sarsa.py: https://gym.openai.com/evaluations/eval_D7pDK88ESHCdiZR7YZeqMQ

 ### Running:
-openAI gym, and python3 are required to run any of the programs.
+openAI gym, numpy, and python3 are required to run any of the programs.
 To run enter: `python3 filename.py` on command line

qlearn.py (+8 -10)

@@ -2,6 +2,7 @@
 ### Q-Learning with Boltzmann Exploration
 from math import exp
 import random
+import numpy as np
 import gym
 from gym import wrappers
 env = gym.make('FrozenLake8x8-v0')
@@ -20,17 +21,14 @@ def update(state, action, reward, nextstate, alpha, gamma):

 # used to select action to take
 def policy(state, t):
-    actions = {}
-    sm = sum([exp(q[(state,a)]/t) for a in range(env.action_space.n)])
-    for a in range(env.action_space.n): # store each probability(action: a|state)
-        actions[a] = exp(q[(state,a)]/t)/sm # boltzmann equation
+    p = np.array([q[(state,x)]/t for x in range(env.action_space.n)])
+    prob_actions = np.exp(p) / np.sum(np.exp(p))
     cumulative_probability = 0.0
-    random_choice = random.uniform(0,sm)
-    for action, prob in sorted(actions.items(), key =lambda x: x[1]):
-        cumulative_probability += prob
-        if cumulative_probability >= random_choice:
-            break
-    return action
+    choice = random.uniform(0,1)
+    for a,pr in enumerate(prob_actions):
+        cumulative_probability += pr
+        if cumulative_probability > choice:
+            return a

 alpha = 0.8
 gamma = 0.999
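
The cumulative-probability loop in the new policy is equivalent to sampling directly from the softmax distribution. A minimal sketch using numpy's built-in sampler (not part of this commit; it reuses the q table, env, and temperature t defined in qlearn.py, and the name policy_choice is illustrative):

import numpy as np

def policy_choice(state, t):
    # same softmax distribution as in the updated policy()
    p = np.array([q[(state, a)] / t for a in range(env.action_space.n)])
    prob_actions = np.exp(p) / np.sum(np.exp(p))
    # draw one action index with the given probabilities
    return np.random.choice(env.action_space.n, p=prob_actions)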

sarsa.py (+13 -13)

@@ -1,30 +1,30 @@
 ### Ashwin Jeyaseelan
 ### Sarsa (an on-policy TD control algorithm)
 import gym
-
+from gym import wrappers
 import random
+import numpy as np
 env = gym.make('Taxi-v1')
-from math import exp
-
+env = gym.wrappers.Monitor(env, "gym_results", force=True)

 Q = {}
 for s in range(env.observation_space.n):
     for a in range(env.action_space.n):
         Q[(s,a)] = 0.0

 def policy(state, t):
-    sm = sum([exp(Q[(state,a)]/t) for a in range(env.action_space.n)])
-    random_choice = random.uniform(0,sm)
+    p = np.array([Q[(state,x)]/t for x in range(env.action_space.n)])
+    prob_actions = np.exp(p) / np.sum(np.exp(p))
     cumulative_probability = 0.0
+    choice = random.uniform(0,1)
+    for a,pr in enumerate(prob_actions):
+        cumulative_probability += pr
+        if cumulative_probability > choice:
+            return a

-    for x in range(env.action_space.n):
-        cumulative_probability += exp(Q[(state,x)]/t)
-        if cumulative_probability > random_choice:
-            return x
-
-alpha = 0.45
+alpha = 0.85
 gamma = 0.90
-t = 2.0
+t = 4.0

 for _ in range(4000):
     r = 0 # r keeps track of accumulated score (used to measure performance at each episode!)
@@ -38,10 +38,10 @@ def policy(state, t):

         action = action2
         state = state2
-
         r += reward

         if done:
+            t = 1.00
             break

     print("total reward: ", r)
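
One caveat not addressed by this commit: np.exp can overflow if the Q-values grow large relative to the temperature t. A common guard (an illustrative sketch, not code from the repository) is to subtract the maximum before exponentiating, which leaves the resulting distribution unchanged:

import numpy as np

def stable_softmax(p):
    # shift by the max so the largest exponent is exp(0) = 1
    z = p - np.max(p)
    e = np.exp(z)
    return e / np.sum(e)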

0 commit comments
