Commit 0660caca by Steven Cordwell

### Completed RelativeValueIteration class to a useful state

parent 4f17fb5f
 ... @@ -304,7 +304,6 @@ def exampleForest(S=3, r1=4, r2=2, p=0.1): ... @@ -304,7 +304,6 @@ def exampleForest(S=3, r1=4, r2=2, p=0.1): array([[[ 0.1, 0.9, 0. ], array([[[ 0.1, 0.9, 0. ], [ 0.1, 0. , 0.9], [ 0.1, 0. , 0.9], [ 0.1, 0. , 0.9]], [ 0.1, 0. , 0.9]], [[ 1. , 0. , 0. ], [[ 1. , 0. , 0. ], [ 1. , 0. , 0. ], [ 1. , 0. , 0. ], [ 1. , 0. , 0. ]]]) [ 1. , 0. , 0. ]]]) ... @@ -429,12 +428,12 @@ def getSpan(W): ... @@ -429,12 +428,12 @@ def getSpan(W): class MDP(object): class MDP(object): """The Markov Decision Problem Toolbox.""" """The Markov Decision Problem Toolbox.""" def __init__(self, transitions, reward, discount, max_iter): def __init__(self, transitions, reward, discount, epsilon, max_iter): """""" """""" # if the discount is None then the algorithm is assumed to not use it # if the discount is None then the algorithm is assumed to not use it # in its computations # in its computations if (type(discount) is int) or (type(discount) is float): if type(discount) in (int, float): if (discount <= 0) or (discount > 1): if (discount <= 0) or (discount > 1): raise ValueError(mdperr["discount_rng"]) raise ValueError(mdperr["discount_rng"]) else: else: ... @@ -448,8 +447,8 @@ class MDP(object): ... @@ -448,8 +447,8 @@ class MDP(object): # if the max_iter is None then the algorithm is assumed to not use it # if the max_iter is None then the algorithm is assumed to not use it # in its computations # in its computations if (type(max_iter) is int) or (type(max_iter) is float): if type(max_iter) in (int, float): if (max_iter <= 0): if max_iter <= 0: raise ValueError(mdperr["maxi_min"]) raise ValueError(mdperr["maxi_min"]) else: else: self.max_iter = max_iter self.max_iter = max_iter ... @@ -457,6 +456,13 @@ class MDP(object): ... 
@@ -457,6 +456,13 @@ class MDP(object): raise ValueError("PyMDPtoolbox: max_iter must be a positive real "\ raise ValueError("PyMDPtoolbox: max_iter must be a positive real "\ "number greater than zero.") "number greater than zero.") if type(epsilon) in (int, float): if epsilon <= 0: raise ValueError("PyMDPtoolbox: epsilon must be greater than 0") elif not epsilon is None: raise ValueError("PyMDPtoolbox: epsilon must be a positive real "\ "number greater than zero.") # we run a check on P and R to make sure they are describing an MDP. If # we run a check on P and R to make sure they are describing an MDP. If # an exception isn't raised then they are assumed to be correct. # an exception isn't raised then they are assumed to be correct. check(transitions, reward) check(transitions, reward) ... @@ -744,7 +750,7 @@ class PolicyIteration(MDP): ... @@ -744,7 +750,7 @@ class PolicyIteration(MDP): def __init__(self, transitions, reward, discount, policy0=None, max_iter=1000, eval_type=0): def __init__(self, transitions, reward, discount, policy0=None, max_iter=1000, eval_type=0): """""" """""" MDP.__init__(self, transitions, reward, discount, max_iter) MDP.__init__(self, transitions, reward, discount, None, max_iter) if policy0 == None: if policy0 == None: # initialise the policy to the one which maximises the expected # initialise the policy to the one which maximises the expected ... @@ -913,7 +919,7 @@ class PolicyIteration(MDP): ... 
@@ -913,7 +919,7 @@ class PolicyIteration(MDP): Ppolicy, Rpolicy = self.computePpolicyPRpolicy() Ppolicy, Rpolicy = self.computePpolicyPRpolicy() # V = PR + gPV => (I-gP)V = PR => V = inv(I-gP)* PR # V = PR + gPV => (I-gP)V = PR => V = inv(I-gP)* PR self.V = self.lin_eq((self.speye(self.S, self.S) - self.discount * Ppolicy) , Rpolicy) self.V = self.lin_eq((self.speye(self.S, self.S) - self.discount * Ppolicy), Rpolicy) def iterate(self): def iterate(self): """Run the policy iteration algorithm.""" """Run the policy iteration algorithm.""" ... @@ -961,7 +967,7 @@ class PolicyIteration(MDP): ... @@ -961,7 +967,7 @@ class PolicyIteration(MDP): self.V = tuple(array(self.V).reshape(self.S).tolist()) self.V = tuple(array(self.V).reshape(self.S).tolist()) self.policy = tuple(array(self.policy).reshape(self.S).tolist()) self.policy = tuple(array(self.policy).reshape(self.S).tolist()) class PolicyIterationModified(MDP): class PolicyIterationModified(PolicyIteration): """Resolution of discounted MDP with policy iteration algorithm """Resolution of discounted MDP with policy iteration algorithm Arguments Arguments ... @@ -1002,10 +1008,16 @@ class PolicyIterationModified(MDP): ... 
@@ -1002,10 +1008,16 @@ class PolicyIterationModified(MDP): def __init__(self, transitions, reward, discount, epsilon=0.01, max_iter=10): def __init__(self, transitions, reward, discount, epsilon=0.01, max_iter=10): """""" """""" MDP.__init__(self, transitions, reward, discount, max_iter) PolicyIteration.__init__(self, transitions, reward, discount, None, max_iter, 1) # PolicyIteration doesn't pass epsilon to MDP.__init__() so we will # check it here if type(epsilon) in (int, float): if epsilon <= 0: if epsilon <= 0: raise ValueError("epsilon must be greater than 0") raise ValueError("PyMDPtoolbox: epsilon must be greater than 0") else: raise ValueError("PyMDPtoolbox: epsilon must be a positive real "\ "number greater than zero.") # computation of threshold of variation for V for an epsilon-optimal policy # computation of threshold of variation for V for an epsilon-optimal policy if self.discount != 1: if self.discount != 1: ... @@ -1128,7 +1140,7 @@ class QLearning(MDP): ... @@ -1128,7 +1140,7 @@ class QLearning(MDP): raise ValueError("PyMDPtoolbox: n_iter should be greater than 10000") raise ValueError("PyMDPtoolbox: n_iter should be greater than 10000") # after this n_iter will be known as self.max_iter # after this n_iter will be known as self.max_iter MDP.__init__(self, transitions, reward, discount, n_iter) MDP.__init__(self, transitions, reward, discount, None, n_iter) # Initialisations # Initialisations self.Q = zeros((self.S, self.A)) self.Q = zeros((self.S, self.A)) ... @@ -1238,13 +1250,15 @@ class RelativeValueIteration(MDP): ... 
@@ -1238,13 +1250,15 @@ class RelativeValueIteration(MDP): def __init__(self, transitions, reward, epsilon=0.01, max_iter=1000): def __init__(self, transitions, reward, epsilon=0.01, max_iter=1000): MDP.__init__(self, transitions, reward, None, max_iter) MDP.__init__(self, transitions, reward, None, epsilon, max_iter) if epsilon <= 0: self.epsilon = epsilon print('MDP Toolbox ERROR: epsilon must be upper than 0') self.discount = 1 self.U = zeros(self.S, 1) self.V = matrix(zeros((self.S, 1))) self.gain = self.U[self.S] self.gain = 0 # self.U[self.S] self.average_reward = None def iterate(self): def iterate(self): """""" """""" ... @@ -1259,30 +1273,34 @@ class RelativeValueIteration(MDP): ... @@ -1259,30 +1273,34 @@ class RelativeValueIteration(MDP): self.iter = self.iter + 1; self.iter = self.iter + 1; Unext, policy = self.bellmanOperator(self.P, self.R, 1, self.U) self.policy, Vnext = self.bellmanOperator() Unext = Unext - self.gain Vnext = Vnext - self.gain variation = getSpan(Unext - self.U) variation = getSpan(Vnext - self.V) if self.verbose: if self.verbose: print(" %s %s" % (self.iter, variation)) print(" %s %s" % (self.iter, variation)) if variation < self.epsilon: if variation < self.epsilon: done = True done = True average_reward = self.gain + min(Unext - self.U) self.average_reward = self.gain + (Vnext - self.V).min() if self.verbose: if self.verbose: print('MDP Toolbox : iterations stopped, epsilon-optimal policy found') print('MDP Toolbox : iterations stopped, epsilon-optimal policy found') elif self.iter == self.max_iter: elif self.iter == self.max_iter: done = True done = True average_reward = self.gain + min(Unext - self.U); self.average_reward = self.gain + (Vnext - self.V).min() if self.verbose: if self.verbose: print('MDP Toolbox : iterations stopped by maximum number of iteration condition') print('MDP Toolbox : iterations stopped by maximum number of iteration condition') self.U = Unext self.V = Vnext self.gain = self.U(self.S) self.gain = 
float(self.V[self.S - 1]) self.time = time() - self.time self.time = time() - self.time # store value and policy as tuples self.V = tuple(self.V.getA1().tolist()) self.policy = tuple(self.policy.getA1().tolist()) class ValueIteration(MDP): class ValueIteration(MDP): """ """ Solves discounted MDP with the value iteration algorithm. Solves discounted MDP with the value iteration algorithm. ... @@ -1401,21 +1419,18 @@ class ValueIteration(MDP): ... @@ -1401,21 +1419,18 @@ class ValueIteration(MDP): def __init__(self, transitions, reward, discount, epsilon=0.01, max_iter=1000, initial_value=0): def __init__(self, transitions, reward, discount, epsilon=0.01, max_iter=1000, initial_value=0): """Resolution of discounted MDP with value iteration algorithm.""" """Resolution of discounted MDP with value iteration algorithm.""" MDP.__init__(self, transitions, reward, discount, max_iter) MDP.__init__(self, transitions, reward, discount, epsilon, max_iter) # initialization of optional arguments # initialization of optional arguments if (initial_value == 0): if initial_value == 0: self.V = matrix(zeros((self.S, 1))) self.V = matrix(zeros((self.S, 1))) else: else: if (not initial_value.shape in ((self.S, ), (self.S, 1), (1, self.S))): if not initial_value.shape in ((self.S, ), (self.S, 1), (1, self.S)): raise ValueError("PyMDPtoolbox: The initial value must be a vector of length S") raise ValueError("PyMDPtoolbox: The initial value must be a vector of length S") else: else: self.V = matrix(initial_value) self.V = matrix(initial_value) if epsilon <= 0: if self.discount < 1: raise ValueError("PyMDPtoolbox: epsilon must be greater than 0") if (self.discount < 1): # compute a bound for the number of iterations and update the # compute a bound for the number of iterations and update the # stored value of self.max_iter # stored value of self.max_iter self.boundIter(epsilon) self.boundIter(epsilon) ... @@ -1464,7 +1479,7 @@ class ValueIteration(MDP): ... 
@@ -1464,7 +1479,7 @@ class ValueIteration(MDP): max_iter = log( (epsilon * (1 - self.discount) / self.discount) / getSpan(value - Vprev) ) / log(self.discount * k) max_iter = log( (epsilon * (1 - self.discount) / self.discount) / getSpan(value - Vprev) ) / log(self.discount * k) #self.V = Vprev #self.V = Vprev self.max_iter = ceil(max_iter) self.max_iter = int(ceil(max_iter)) def iterate(self): def iterate(self): """ """ ... ...
 ... @@ -6,7 +6,8 @@ Created on Sun May 27 23:16:57 2012 ... @@ -6,7 +6,8 @@ Created on Sun May 27 23:16:57 2012 """ """ from mdp import check, checkSquareStochastic, exampleForest, exampleRand, MDP from mdp import check, checkSquareStochastic, exampleForest, exampleRand, MDP from mdp import PolicyIteration, ValueIteration, ValueIterationGS from mdp import PolicyIteration, RelativeValueIteration, ValueIteration from mdp import ValueIterationGS from numpy import absolute, array, eye, matrix, zeros from numpy import absolute, array, eye, matrix, zeros from numpy.random import rand from numpy.random import rand ... @@ -18,6 +19,13 @@ STATES = 10 ... @@ -18,6 +19,13 @@ STATES = 10 ACTIONS = 3 ACTIONS = 3 SMALLNUM = 10e-12 SMALLNUM = 10e-12 # Arrays P = array([[[0.5, 0.5],[0.8, 0.2]],[[0, 1],[0.1, 0.9]]]) R = array([[5, 10], [-1, 2]]) Pf, Rf = exampleForest() Pr, Rr = exampleRand(STATES, ACTIONS) Prs, Rrs = exampleRand(STATES, ACTIONS, is_sparse=True) # check: square, stochastic and non-negative ndarrays # check: square, stochastic and non-negative ndarrays def test_check_square_stochastic_nonnegative_array_1(): def test_check_square_stochastic_nonnegative_array_1(): ... @@ -130,7 +138,6 @@ def test_checkSquareStochastic_eye_sparse(): ... @@ -130,7 +138,6 @@ def test_checkSquareStochastic_eye_sparse(): assert checkSquareStochastic(P) == None assert checkSquareStochastic(P) == None # exampleForest # exampleForest Pf, Rf = exampleForest() def test_exampleForest_P_shape(): def test_exampleForest_P_shape(): assert (Pf == array([[[0.1, 0.9, 0.0], assert (Pf == array([[[0.1, 0.9, 0.0], ... @@ -151,8 +158,6 @@ def test_exampleForest_check(): ... @@ -151,8 +158,6 @@ def test_exampleForest_check(): # exampleRand # exampleRand Pr, Rr = exampleRand(STATES, ACTIONS) def test_exampleRand_dense_P_shape(): def test_exampleRand_dense_P_shape(): assert (Pr.shape == (ACTIONS, STATES, STATES)) assert (Pr.shape == (ACTIONS, STATES, STATES)) ... 
@@ -162,8 +167,6 @@ def test_exampleRand_dense_R_shape(): ... @@ -162,8 +167,6 @@ def test_exampleRand_dense_R_shape(): def test_exampleRand_dense_check(): def test_exampleRand_dense_check(): assert check(Pr, Rr) == None assert check(Pr, Rr) == None Prs, Rrs = exampleRand(STATES, ACTIONS, is_sparse=True) def test_exampleRand_sparse_P_shape(): def test_exampleRand_sparse_P_shape(): assert (Prs.shape == (ACTIONS, )) assert (Prs.shape == (ACTIONS, )) ... @@ -173,9 +176,6 @@ def test_exampleRand_sparse_R_shape(): ... @@ -173,9 +176,6 @@ def test_exampleRand_sparse_R_shape(): def test_exampleRand_sparse_check(): def test_exampleRand_sparse_check(): assert check(Prs, Rrs) == None assert check(Prs, Rrs) == None P = array([[[0.5, 0.5],[0.8, 0.2]],[[0, 1],[0.1, 0.9]]]) R = array([[5, 10], [-1, 2]]) # MDP # MDP def test_MDP_P_R_1(): def test_MDP_P_R_1(): ... @@ -298,6 +298,11 @@ def test_PolicyIteration_matrix_exampleForest(): ... @@ -298,6 +298,11 @@ def test_PolicyIteration_matrix_exampleForest(): # ValueIterationGS # ValueIterationGS def test_ValueIterationGS_boundIter_exampleForest(): a = ValueIterationGS(Pf, Rf, 0.9) itr = 39 assert (a.max_iter == itr) def test_ValueIterationGS_exampleForest(): def test_ValueIterationGS_exampleForest(): a = ValueIterationGS(Pf, Rf, 0.9) a = ValueIterationGS(Pf, Rf, 0.9) p = matrix('0 0 0') p = matrix('0 0 0') ... @@ -308,6 +313,18 @@ def test_ValueIterationGS_exampleForest(): ... 
@@ -308,6 +313,18 @@ def test_ValueIterationGS_exampleForest(): assert a.iter == itr assert a.iter == itr assert (absolute(array(a.V) - v) < SMALLNUM).all() assert (absolute(array(a.V) - v) < SMALLNUM).all() # RelativeValueIteration def test_RelativeValueIteration_exampleForest(): a = RelativeValueIteration(Pf, Rf) itr = 4 p = matrix('0 0 0') v = matrix('-4.360000000000000 -0.760000000000000 3.240000000000000') a.iterate() assert (array(a.policy) == p).all() assert a.iter == itr assert (absolute(array(a.V) - v) < SMALLNUM).all() #def test_JacksCarRental(): #def test_JacksCarRental(): # S = 21 ** 2 # S = 21 ** 2 # A = 11 # A = 11 ... ...
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!