Commit 97b3a813 authored by Steven Cordwell

various fixes, especially ValueIteration.boundIter()

parent a12de419
@@ -84,7 +84,11 @@ mdperr = {
        "PyMDPtoolbox: Number of states S must be greater than 1.",
    "SA_gt_1" :
        "PyMDPtoolbox: The number of states S and the number of actions A must be "
-        "greater than 1."
+        "greater than 1.",
+    "discount_rng" :
+        "PyMDPtoolbox: Discount rate must be in ]0; 1]",
+    "maxi_min" :
+        "PyMDPtoolbox: The maximum number of iterations must be greater than 0"
}
def exampleForest(S=3, r1=4, r2=2, p=0.1):
@@ -234,7 +238,7 @@ def exampleRand(S, A, is_sparse=False, mask=None):
class MDP(object):
    """The Markov Decision Problem Toolbox."""
-    def __init__(self):
+    def __init__(self, transitions, reward, discount, max_iter):
        """"""
        # the verbosity is by default turned off
        self.verbose = False
@@ -242,22 +246,22 @@ class MDP(object):
        # Initially the time taken to perform the computations is set to None
        self.time = None
-        # These are some placeholder attributes that need to be overridden in
-        # child classes.
-        # S is the number of states
-        self.S = None
-        # A is the number of actions
-        self.A = None
-        # R is the reward matrix
-        self.R = None
-        # P is the probability-transition matrix
-        self.P = None
-        # policy is the optimal control policy
-        self.policy = None
-        # value is a vector of expected future values for each state
-        self.value = None
-        # discount is the per time step discount factor
-        self.discount = None
+        if (discount <= 0) or (discount > 1):
+            raise ValueError(mdperr["discount_rng"])
+        else:
+            self.discount = discount
+        if (max_iter <= 0):
+            raise ValueError(mdperr["maxi_min"])
+        else:
+            self.max_iter = max_iter
+        self.check(transitions, reward)
+        self.computePR(transitions, reward)
+        # set the initial iteration count to zero
+        self.iter = 0
    def bellmanOperator(self):
        """
@@ -532,10 +536,6 @@ class MDP(object):
        """
        return (W.max() - W.min())
-    def setup(self):
-        """A helper function to perform various checks and preparations."""
-        pass
    def setSilent(self):
        """Ask for running resolution functions of the MDP Toolbox in silent
        mode.
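Note: getSpan above computes the span semi-norm sp(W) = max(W) - min(W), which both the value-iteration stopping rule and boundIter below rely on. A quick standalone check:

```python
# sp(W) = max(W) - min(W); it is zero exactly when W is constant
from numpy import array

W = array([1.0, 4.0, 2.5])
assert W.max() - W.min() == 3.0  # what MDP.getSpan(W) returns
```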
@@ -897,7 +897,7 @@ class QLearning(MDP):
    Then the length of this vector for the default value of N is 100
    (N/100).
    Examples
    ---------
    >>> import mdp
    >>> P, R = mdp.exampleForest()
@@ -934,21 +934,13 @@ class QLearning(MDP):
        """Evaluation of the matrix Q, using the Q learning algorithm
        """
-        MDP.__init__(self)
-        # Check of arguments
-        if (discount <= 0) or (discount >= 1):
-            raise ValueError("MDP Toolbox Error: Discount rate must be in ]0,1[")
-        elif (n_iter < 10000):
-            raise ValueError("MDP Toolbox Error: n_iter must be greater than 10000")
-        self.check(transitions, reward)
-        self.computePR(transitions, reward)
+        # The following check won't be done in MDP()'s initialisation, so let's
+        # do it here
+        if (n_iter < 10000):
+            raise ValueError("PyMDPtoolbox: n_iter should be greater than 10000")
-        self.discount = discount
-        self.n_iter = n_iter
+        # after this n_iter will be known as self.max_iter
+        MDP.__init__(self, transitions, reward, discount, n_iter)
        # Initialisations
        self.Q = zeros((self.S, self.A))
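Note: a hypothetical usage sketch of the refactored constructor, reusing the exampleForest problem from the docstring above and assuming the signature QLearning(transitions, reward, discount, n_iter); values of n_iter below 10000 are rejected by the check kept in this class:

```python
import mdp

P, R = mdp.exampleForest()
ql = mdp.QLearning(P, R, 0.95, 10000)  # n_iter is handed on to MDP.__init__,
                                       # which stores it as max_iter
# mdp.QLearning(P, R, 0.95, 5000)      # would raise: n_iter below 10000
```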
@@ -964,7 +956,7 @@
        # initial state choice
        # s = randint(0, self.S - 1)
-        for n in range(self.n_iter):
+        for n in range(self.max_iter):
            # Reinitialisation of trajectories every 100 transitions
            if ((n % 100) == 0):
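Note: the loop above now runs for self.max_iter simulated transitions. For orientation, here is a sketch of the standard tabular Q-learning step such a loop performs; the uniform exploration rule and the decaying step size are illustrative assumptions, not necessarily the exact schedule this toolbox uses:

```python
from numpy.random import randint, random

def q_step(Q, P, R, s, discount, n):
    """One simulated transition and temporal-difference update (sketch)."""
    a = randint(0, Q.shape[1])              # exploratory action choice
    # sample the next state from the transition row P[a][s, :]
    p, s_new, cum = random(), 0, P[a][s, 0]
    while cum < p:
        s_new += 1
        cum += P[a][s, s_new]
    # Q-learning update with a decaying step size
    delta = R[s, a] + discount * Q[s_new, :].max() - Q[s, a]
    Q[s, a] += (1.0 / (n + 2)) * delta
    return s_new
```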
@@ -1016,12 +1008,17 @@
        self.policy = self.Q.argmax(axis=1)
        self.time = time() - self.time
+        # rather than report that we have not done any iterations, assign the
+        # value of n_iter to self.iter
+        self.iter = self.max_iter
class RelativeValueIteration(MDP):
    """Resolution of MDP with average reward with relative value iteration
    algorithm.
    """
-    raise NotImplementedError("This class has not been implemented yet.")
+    pass
+    #raise NotImplementedError("This class has not been implemented yet.")
class ValueIteration(MDP):
    """
@@ -1141,11 +1138,7 @@ class ValueIteration(MDP):
    def __init__(self, transitions, reward, discount, epsilon=0.01, max_iter=1000, initial_value=0):
        """Resolution of discounted MDP with value iteration algorithm."""
-        MDP.__init__(self)
-        self.check(transitions, reward)
-        self.computePR(transitions, reward)
+        MDP.__init__(self, transitions, reward, discount, max_iter)
        # initialization of optional arguments
        if (initial_value == 0):
@@ -1153,23 +1146,19 @@ class ValueIteration(MDP):
        else:
            if (initial_value.size != self.S):
                raise ValueError("The initial value must be length S")
-            self.value = matrix(initial_value)
-        self.discount = discount
-        if (discount < 1):
-            # compute a bound for the number of iterations
-            #self.max_iter = self.boundIter(epsilon)
-            self.max_iter = 5000
-            # computation of threshold of variation for V for an epsilon-optimal policy
+            else:
+                self.value = matrix(initial_value)
+        if (self.discount < 1):
+            # compute a bound for the number of iterations and update the
+            # stored value of self.max_iter
+            self.boundIter(epsilon)
+            # computation of threshold of variation for V for an epsilon-
+            # optimal policy
            self.thresh = epsilon * (1 - self.discount) / self.discount
        else: # discount == 1
-            # bound for the number of iterations
-            self.max_iter = max_iter
-            # threshold of variation for V for an epsilon-optimal policy
-            self.thresh = epsilon
-        self.iter = 0
+            self.thresh = epsilon
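Note: the branch above fixes the stopping threshold. For discount < 1 the span of successive value functions is compared against epsilon * (1 - discount) / discount, which makes the greedy policy epsilon-optimal at termination; for discount == 1 the raw epsilon is used. As a standalone sketch (vi_threshold is an illustrative name):

```python
def vi_threshold(epsilon, discount):
    # threshold of variation for V for an epsilon-optimal policy
    if discount < 1:
        return epsilon * (1 - discount) / discount
    return epsilon  # undiscounted case

print(vi_threshold(0.01, 0.9))  # 0.001111..., much tighter than the raw epsilon
```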
    def boundIter(self, epsilon):
        """Computes a bound for the number of iterations for the value iteration
@@ -1195,7 +1184,7 @@
        h = zeros(self.S)
        for ss in range(self.S):
-            PP = zeros((self.S, self.A))
+            PP = matrix(zeros((self.S, self.A)))
            for aa in range(self.A):
                PP[:, aa] = self.P[aa][:, ss]
            # the function "min()" without any arguments finds the
@@ -1203,10 +1192,13 @@
            h[ss] = PP.min()
        k = 1 - h.sum()
-        V1 = self.bellmanOperator(self.value)
+        Vprev = self.value
+        self.bellmanOperator()
        # p 201, Proposition 6.6.5
-        max_iter = log( (epsilon * (1 - self.discount) / self.discount) / self.getSpan(V1 - self.value) ) / log(self.discount * k)
-        return ceil(max_iter)
+        max_iter = log( (epsilon * (1 - self.discount) / self.discount) / self.getSpan(self.value - Vprev) ) / log(self.discount * k)
+        self.value = Vprev
+        self.max_iter = ceil(max_iter)
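Note: a self-contained sketch of the corrected bound computation, following the p 201, Proposition 6.6.5 formula used above; bound_iter is an illustrative name, not toolbox API, and P is assumed to be an (A, S, S) array of row-stochastic transition matrices:

```python
from math import ceil, log
from numpy import zeros

def bound_iter(P, V0, V1, discount, epsilon):
    """Bound on the number of value-iteration steps to an epsilon-optimal
    policy; V1 must be one Bellman backup of V0 (sketch only)."""
    A, S, _ = P.shape
    h = zeros(S)
    for ss in range(S):
        # smallest probability, over all actions and source states,
        # of moving into state ss (what the PP loop above collects)
        h[ss] = P[:, :, ss].min()
    k = 1 - h.sum()
    span = (V1 - V0).max() - (V1 - V0).min()  # getSpan(V1 - V0)
    return ceil(log((epsilon * (1 - discount) / discount) / span)
                / log(discount * k))
```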
    def iterate(self):
        """
@@ -116,7 +116,7 @@ def test_exampleRand_dense_shape():
def test_exampleRand_dense_check():
    P, R = exampleRand(STATES, ACTIONS)
-    inst.check(P, R)
+    assert inst.check(P, R) == None
def test_exampleRand_sparse_shape():
    P, R = exampleRand(STATES, ACTIONS, is_sparse=True)
@@ -125,7 +125,7 @@ def test_exampleRand_sparse_shape():
def test_exampleRand_sparse_check():
    P, R = exampleRand(STATES, ACTIONS, is_sparse=True)
-    inst.check(P, R)
+    assert inst.check(P, R) == None
# ValueIteration
@@ -138,6 +138,12 @@ def test_ValueIteration():
    assert (inst.policy == (1, 0))
    assert (inst.iter == 26)
+def test_ValueIteration_boundIter():
+    P = array([[[0.5, 0.5],[0.8, 0.2]],[[0, 1],[0.1, 0.9]]])
+    R = array([[5, 10], [-1, 2]])
+    inst = ValueIteration(P, R, 0.9, 0.01)
+    assert (inst.max_iter == 28)
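Note: the asserted bound of 28 can be checked by hand against the formula in boundIter: here h = [0, 0.2], so k = 0.8, and one Bellman backup of V = 0 gives V1 = (10, 2), hence a span of 8:

```python
from math import ceil, log

bound = log((0.01 * (1 - 0.9) / 0.9) / 8) / log(0.9 * 0.8)
assert ceil(bound) == 28  # matches inst.max_iter in the test above
```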
def test_JacksCarRental():
    S = 21 ** 2
    A = 11