
Commit 4cd279f

Fixed softmax policy errors

Fixed policy function in qlearn.py and sarsa.py. Also updated the descriptions in README.md.

1 parent dc6156b commit 4cd279f
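
For reference, the corrected action selection in both qlearn.py and sarsa.py follows the pattern below. This is a minimal standalone sketch: the name softmax_policy and the n_actions parameter are introduced here for illustration, while the repository code reads the action count from env.action_space.n and the values from its q/Q table.

import random
import numpy as np

def softmax_policy(q, state, n_actions, t):
    # temperature-scaled value of each action in this state
    p = np.array([q[(state, a)] / t for a in range(n_actions)])
    # softmax (Boltzmann) distribution over actions
    prob_actions = np.exp(p) / np.sum(np.exp(p))
    # sample an action index by walking the cumulative distribution
    choice = random.uniform(0, 1)
    cumulative_probability = 0.0
    for a, pr in enumerate(prob_actions):
        cumulative_probability += pr
        if cumulative_probability > choice:
            return a
    return n_actions - 1  # fallback in case of floating-point round-off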

File tree

3 files changed: +25 -25 lines changed

README.md (+4 -2)

@@ -2,13 +2,15 @@
 Implementation of Reinforcement Learning Algorithms, mainly in openAI gym

 ### Algorithms Implemented so far:
-* Q-learning with Boltzmann Action Selection: qlearn.py (still working on parameters)
+* Q-learning with Softmax Action Selection: qlearn.py
 * Q-learning with e-greedy Action Selection: qlearn2.py
+* SARSA with Softmax Action Selection: sarsa.py


 ### Evaluations:
 * qlearn2.py: https://gym.openai.com/evaluations/eval_S6ldgxysSay2eidRk9eHNA
+* sarsa.py: https://gym.openai.com/evaluations/eval_D7pDK88ESHCdiZR7YZeqMQ

 ### Running:
-openAI gym, and python3 are required to run any of the programs.
+openAI gym, numpy, and python3 are required to run any of the programs.
 To run enter: `python3 filename.py` on command line

qlearn.py (+8 -10)

@@ -2,6 +2,7 @@
 ### Q-Learning with Boltzmann Exploration
 from math import exp
 import random
+import numpy as np
 import gym
 from gym import wrappers
 env = gym.make('FrozenLake8x8-v0')
@@ -20,17 +21,14 @@ def update(state, action, reward, nextstate, alpha, gamma):

 # used to select action to take
 def policy(state, t):
-    actions = {}
-    sm = sum([exp(q[(state,a)]/t) for a in range(env.action_space.n)])
-    for a in range(env.action_space.n): # store each probability(action: a|state)
-        actions[a] = exp(q[(state,a)]/t)/sm # boltzmann equation
+    p = np.array([q[(state,x)]/t for x in range(env.action_space.n)])
+    prob_actions = np.exp(p) / np.sum(np.exp(p))
     cumulative_probability = 0.0
-    random_choice = random.uniform(0,sm)
-    for action, prob in sorted(actions.items(), key =lambda x: x[1]):
-        cumulative_probability += prob
-        if cumulative_probability >= random_choice:
-            break
-    return action
+    choice = random.uniform(0,1)
+    for a,pr in enumerate(prob_actions):
+        cumulative_probability += pr
+        if cumulative_probability > choice:
+            return a

 alpha = 0.8
 gamma = 0.999
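
The cumulative-probability loop in the new policy is equivalent to sampling directly from the softmax distribution. A minimal sketch using numpy's built-in sampler (not part of this commit; it reuses the q table, env, and temperature t defined in qlearn.py, and the name policy_choice is illustrative):

import numpy as np

def policy_choice(state, t):
    # same softmax distribution as in the updated policy()
    p = np.array([q[(state, a)] / t for a in range(env.action_space.n)])
    prob_actions = np.exp(p) / np.sum(np.exp(p))
    # draw one action index with the given probabilities
    return np.random.choice(env.action_space.n, p=prob_actions)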

sarsa.py (+13 -13)

@@ -1,30 +1,30 @@
 ### Ashwin Jeyaseelan
 ### Sarsa (an on-policy TD control algorithm)
 import gym
-
+from gym import wrappers
 import random
+import numpy as np
 env = gym.make('Taxi-v1')
-from math import exp
-
+env = gym.wrappers.Monitor(env, "gym_results", force=True)

 Q = {}
 for s in range(env.observation_space.n):
     for a in range(env.action_space.n):
         Q[(s,a)] = 0.0

 def policy(state, t):
-    sm = sum([exp(Q[(state,a)]/t) for a in range(env.action_space.n)])
-    random_choice = random.uniform(0,sm)
+    p = np.array([Q[(state,x)]/t for x in range(env.action_space.n)])
+    prob_actions = np.exp(p) / np.sum(np.exp(p))
     cumulative_probability = 0.0
+    choice = random.uniform(0,1)
+    for a,pr in enumerate(prob_actions):
+        cumulative_probability += pr
+        if cumulative_probability > choice:
+            return a

-    for x in range(env.action_space.n):
-        cumulative_probability += exp(Q[(state,x)]/t)
-        if cumulative_probability > random_choice:
-            return x
-
-alpha = 0.45
+alpha = 0.85
 gamma = 0.90
-t = 2.0
+t = 4.0

 for _ in range(4000):
     r = 0 # r keeps track of accumulated score (used to measure performance at each episode!)
@@ -38,10 +38,10 @@ def policy(state, t):

         action = action2
         state = state2
-
         r += reward

         if done:
+            t = 1.00
             break

     print("total reward: ", r)
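
One caveat not addressed by this commit: np.exp can overflow if the Q-values grow large relative to the temperature t. A common guard (an illustrative sketch, not code from the repository) is to subtract the maximum before exponentiating, which leaves the resulting distribution unchanged:

import numpy as np

def stable_softmax(p):
    # shift by the max so the largest exponent is exp(0) = 1
    z = p - np.max(p)
    e = np.exp(z)
    return e / np.sum(e)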

0 commit comments
