Commit 0660caca by Steven Cordwell

### completed RelativeValueIteration class to a useful state

parent 4f17fb5f
**mdp.py**

```diff
@@ -304,7 +304,6 @@ def exampleForest(S=3, r1=4, r2=2, p=0.1):
     array([[[ 0.1,  0.9,  0. ],
             [ 0.1,  0. ,  0.9],
             [ 0.1,  0. ,  0.9]],
            [[ 1. ,  0. ,  0. ],
             [ 1. ,  0. ,  0. ],
             [ 1. ,  0. ,  0. ]]])
@@ -429,12 +428,12 @@ def getSpan(W):
 
 class MDP(object):
     """The Markov Decision Problem Toolbox."""
 
-    def __init__(self, transitions, reward, discount, max_iter):
+    def __init__(self, transitions, reward, discount, epsilon, max_iter):
         """"""
         # if the discount is None then the algorithm is assumed to not use it
         # in its computations
-        if (type(discount) is int) or (type(discount) is float):
+        if type(discount) in (int, float):
             if (discount <= 0) or (discount > 1):
                 raise ValueError(mdperr["discount_rng"])
             else:
@@ -448,8 +447,8 @@ class MDP(object):
         # if the max_iter is None then the algorithm is assumed to not use it
         # in its computations
-        if (type(max_iter) is int) or (type(max_iter) is float):
-            if (max_iter <= 0):
+        if type(max_iter) in (int, float):
+            if max_iter <= 0:
                 raise ValueError(mdperr["maxi_min"])
             else:
                 self.max_iter = max_iter
@@ -457,6 +456,13 @@ class MDP(object):
             raise ValueError("PyMDPtoolbox: max_iter must be a positive real "\
                 "number greater than zero.")
 
+        if type(epsilon) in (int, float):
+            if epsilon <= 0:
+                raise ValueError("PyMDPtoolbox: epsilon must be greater than 0")
+        elif not epsilon is None:
+            raise ValueError("PyMDPtoolbox: epsilon must be a positive real "\
+                "number greater than zero.")
+
         # we run a check on P and R to make sure they are describing an MDP. If
         # an exception isn't raised then they are assumed to be correct.
         check(transitions, reward)
@@ -744,7 +750,7 @@ class PolicyIteration(MDP):
     def __init__(self, transitions, reward, discount, policy0=None, max_iter=1000, eval_type=0):
         """"""
-        MDP.__init__(self, transitions, reward, discount, max_iter)
+        MDP.__init__(self, transitions, reward, discount, None, max_iter)
 
         if policy0 == None:
             # initialise the policy to the one which maximises the expected
@@ -913,7 +919,7 @@ class PolicyIteration(MDP):
         Ppolicy, Rpolicy = self.computePpolicyPRpolicy()
         # V = PR + gPV  =>  (I-gP)V = PR  =>  V = inv(I-gP)* PR
-        self.V = self.lin_eq((self.speye(self.S, self.S) - self.discount * Ppolicy) , Rpolicy)
+        self.V = self.lin_eq((self.speye(self.S, self.S) - self.discount * Ppolicy), Rpolicy)
 
     def iterate(self):
         """Run the policy iteration algorithm."""
@@ -961,7 +967,7 @@ class PolicyIteration(MDP):
         self.V = tuple(array(self.V).reshape(self.S).tolist())
         self.policy = tuple(array(self.policy).reshape(self.S).tolist())
 
-class PolicyIterationModified(MDP):
+class PolicyIterationModified(PolicyIteration):
     """Resolution of discounted MDP with policy iteration algorithm
 
     Arguments
@@ -1002,10 +1008,16 @@ class PolicyIterationModified(MDP):
     def __init__(self, transitions, reward, discount, epsilon=0.01, max_iter=10):
         """"""
-        MDP.__init__(self, transitions, reward, discount, max_iter)
+        PolicyIteration.__init__(self, transitions, reward, discount, None, max_iter, 1)
 
-        if epsilon <= 0:
-            raise ValueError("epsilon must be greater than 0")
+        # PolicyIteration doesn't pass epsilon to MDP.__init__() so we will
+        # check it here
+        if type(epsilon) in (int, float):
+            if epsilon <= 0:
+                raise ValueError("PyMDPtoolbox: epsilon must be greater than 0")
+        else:
+            raise ValueError("PyMDPtoolbox: epsilon must be a positive real "\
+                "number greater than zero.")
 
         # computation of threshold of variation for V for an epsilon-optimal policy
         if self.discount != 1:
@@ -1128,7 +1140,7 @@ class QLearning(MDP):
             raise ValueError("PyMDPtoolbox: n_iter should be greater than 10000")
 
         # after this n_iter will be known as self.max_iter
-        MDP.__init__(self, transitions, reward, discount, n_iter)
+        MDP.__init__(self, transitions, reward, discount, None, n_iter)
 
         # Initialisations
         self.Q = zeros((self.S, self.A))
@@ -1238,13 +1250,15 @@ class RelativeValueIteration(MDP):
     def __init__(self, transitions, reward, epsilon=0.01, max_iter=1000):
 
-        MDP.__init__(self, transitions, reward, None, max_iter)
+        MDP.__init__(self, transitions, reward, None, epsilon, max_iter)
 
-        if epsilon <= 0:
-            print('MDP Toolbox ERROR: epsilon must be upper than 0')
-
-        self.U = zeros(self.S, 1)
-        self.gain = self.U[self.S]
-
         self.epsilon = epsilon
         self.discount = 1
 
+        self.V = matrix(zeros((self.S, 1)))
+        self.gain = 0 # self.U[self.S]
+
+        self.average_reward = None
 
     def iterate(self):
         """"""
@@ -1259,29 +1273,33 @@ class RelativeValueIteration(MDP):
             self.iter = self.iter + 1;
 
-            Unext, policy = self.bellmanOperator(self.P, self.R, 1, self.U)
-            Unext = Unext - self.gain
+            self.policy, Vnext = self.bellmanOperator()
+            Vnext = Vnext - self.gain
 
-            variation = getSpan(Unext - self.U)
+            variation = getSpan(Vnext - self.V)
 
             if self.verbose:
                 print(" %s %s" % (self.iter, variation))
 
             if variation < self.epsilon:
                 done = True
-                average_reward = self.gain + min(Unext - self.U)
+                self.average_reward = self.gain + (Vnext - self.V).min()
                 if self.verbose:
                     print('MDP Toolbox : iterations stopped, epsilon-optimal policy found')
             elif self.iter == self.max_iter:
                 done = True
-                average_reward = self.gain + min(Unext - self.U);
+                self.average_reward = self.gain + (Vnext - self.V).min()
                 if self.verbose:
                     print('MDP Toolbox : iterations stopped by maximum number of iteration condition')
 
-            self.U = Unext
-            self.gain = self.U(self.S)
+            self.V = Vnext
+            self.gain = float(self.V[self.S - 1])
 
         self.time = time() - self.time
 
+        # store value and policy as tuples
+        self.V = tuple(self.V.getA1().tolist())
+        self.policy = tuple(self.policy.getA1().tolist())
 
 class ValueIteration(MDP):
     """
@@ -1401,21 +1419,18 @@ class ValueIteration(MDP):
     def __init__(self, transitions, reward, discount, epsilon=0.01, max_iter=1000, initial_value=0):
         """Resolution of discounted MDP with value iteration algorithm."""
 
-        MDP.__init__(self, transitions, reward, discount, max_iter)
+        MDP.__init__(self, transitions, reward, discount, epsilon, max_iter)
 
         # initialization of optional arguments
-        if (initial_value == 0):
+        if initial_value == 0:
             self.V = matrix(zeros((self.S, 1)))
         else:
-            if (not initial_value.shape in ((self.S, ), (self.S, 1), (1, self.S))):
+            if not initial_value.shape in ((self.S, ), (self.S, 1), (1, self.S)):
                 raise ValueError("PyMDPtoolbox: The initial value must be a vector of length S")
             else:
                 self.V = matrix(initial_value)
 
-        if epsilon <= 0:
-            raise ValueError("PyMDPtoolbox: epsilon must be greater than 0")
-
-        if (self.discount < 1):
+        if self.discount < 1:
            # compute a bound for the number of iterations and update the
            # stored value of self.max_iter
            self.boundIter(epsilon)
@@ -1464,7 +1479,7 @@ class ValueIteration(MDP):
         max_iter = log( (epsilon * (1 - self.discount) / self.discount) / getSpan(value - Vprev) ) / log(self.discount * k)
         #self.V = Vprev
 
-        self.max_iter = ceil(max_iter)
+        self.max_iter = int(ceil(max_iter))
 
     def iterate(self):
         """
```
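The stopping test in the `RelativeValueIteration.iterate()` loop above is the span seminorm of the value update, not an ordinary norm. `getSpan` only appears here as hunk context, but in the toolbox it is assumed to compute sp(W) = max(W) − min(W). That matters because relative value iteration works with values defined only up to an additive constant, so a uniform shift of `V` should not count as progress. A minimal sketch of the rule, with the `getSpan` body assumed from the toolbox:

```python
from numpy import array

def getSpan(W):
    # span seminorm: largest minus smallest component; invariant to
    # adding the same constant to every entry of W
    return W.max() - W.min()

V = array([-4.36, -0.76, 3.24])
Vnext = V + 2.4  # a pure gain shift: the greedy policy is unchanged

epsilon = 0.01
variation = getSpan(Vnext - V)  # 0.0 here, even though |Vnext - V| is large
if variation < epsilon:
    print("epsilon-optimal policy found")
```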
**test suite**

```diff
@@ -6,7 +6,8 @@
 Created on Sun May 27 23:16:57 2012
 
 """
 
 from mdp import check, checkSquareStochastic, exampleForest, exampleRand, MDP
-from mdp import PolicyIteration, ValueIteration, ValueIterationGS
+from mdp import PolicyIteration, RelativeValueIteration, ValueIteration
+from mdp import ValueIterationGS
 from numpy import absolute, array, eye, matrix, zeros
 from numpy.random import rand
@@ -18,6 +19,13 @@
 STATES = 10
 ACTIONS = 3
 SMALLNUM = 10e-12
 
+# Arrays
+P = array([[[0.5, 0.5],[0.8, 0.2]],[[0, 1],[0.1, 0.9]]])
+R = array([[5, 10], [-1, 2]])
+Pf, Rf = exampleForest()
+Pr, Rr = exampleRand(STATES, ACTIONS)
+Prs, Rrs = exampleRand(STATES, ACTIONS, is_sparse=True)
+
 # check: square, stochastic and non-negative ndarrays
 
 def test_check_square_stochastic_nonnegative_array_1():
@@ -130,7 +138,6 @@ def test_checkSquareStochastic_eye_sparse():
     assert checkSquareStochastic(P) == None
 
 # exampleForest
-Pf, Rf = exampleForest()
 
 def test_exampleForest_P_shape():
     assert (Pf == array([[[0.1, 0.9, 0.0],
@@ -151,8 +158,6 @@ def test_exampleForest_check():
 
 # exampleRand
 
-Pr, Rr = exampleRand(STATES, ACTIONS)
-
 def test_exampleRand_dense_P_shape():
     assert (Pr.shape == (ACTIONS, STATES, STATES))
@@ -162,8 +167,6 @@ def test_exampleRand_dense_R_shape():
 def test_exampleRand_dense_check():
     assert check(Pr, Rr) == None
 
-Prs, Rrs = exampleRand(STATES, ACTIONS, is_sparse=True)
-
 def test_exampleRand_sparse_P_shape():
     assert (Prs.shape == (ACTIONS, ))
@@ -173,9 +176,6 @@ def test_exampleRand_sparse_R_shape():
 def test_exampleRand_sparse_check():
     assert check(Prs, Rrs) == None
 
-P = array([[[0.5, 0.5],[0.8, 0.2]],[[0, 1],[0.1, 0.9]]])
-R = array([[5, 10], [-1, 2]])
-
 # MDP
 
 def test_MDP_P_R_1():
@@ -298,6 +298,11 @@ def test_PolicyIteration_matrix_exampleForest():
 
 # ValueIterationGS
 
+def test_ValueIterationGS_boundIter_exampleForest():
+    a = ValueIterationGS(Pf, Rf, 0.9)
+    itr = 39
+    assert (a.max_iter == itr)
+
 def test_ValueIterationGS_exampleForest():
     a = ValueIterationGS(Pf, Rf, 0.9)
     p = matrix('0 0 0')
@@ -308,6 +313,18 @@ def test_ValueIterationGS_exampleForest():
     assert a.iter == itr
     assert (absolute(array(a.V) - v) < SMALLNUM).all()
 
+# RelativeValueIteration
+
+def test_RelativeValueIteration_exampleForest():
+    a = RelativeValueIteration(Pf, Rf)
+    itr = 4
+    p = matrix('0 0 0')
+    v = matrix('-4.360000000000000 -0.760000000000000 3.240000000000000')
+    a.iterate()
+    assert (array(a.policy) == p).all()
+    assert a.iter == itr
+    assert (absolute(array(a.V) - v) < SMALLNUM).all()
+
 #def test_JacksCarRental():
 #    S = 21 ** 2
 #    A = 11
```
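The new `test_RelativeValueIteration_exampleForest` test doubles as a usage example for the completed class. A sketch of the same call sequence, assuming `mdp.py` is importable, with the expected outputs taken from the test's assertions:

```python
from mdp import RelativeValueIteration, exampleForest

# 3-state, 2-action forest-management example shipped with the toolbox
P, R = exampleForest()

# undiscounted average-reward problem: no discount argument is taken
rvi = RelativeValueIteration(P, R)
rvi.iterate()

print(rvi.policy)          # (0, 0, 0) per the test above
print(rvi.iter)            # 4 iterations to an epsilon-optimal policy
print(rvi.average_reward)  # gain estimate set when iteration stops
```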