Commit 5f7f035b by Steven Cordwell

### fixes and unittest fixes for ValueIteration and PolicyIteration classes

parent fa69c0d3
 ... ... @@ -33,7 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ from numpy import absolute, array, diag, matrix, mean, mod, multiply, ndarray from numpy import ones, zeros from numpy import nonzero, ones, zeros from numpy.random import rand from math import ceil, log, sqrt from random import randint, random ... ... @@ -419,6 +419,13 @@ def exampleRand(S, A, is_sparse=False, mask=None): return (P, R) def getSpan(self, W): """Returns the span of W sp(W) = max W(s) - min W(s) """ return (W.max() - W.min()) class MDP(object): """The Markov Decision Problem Toolbox.""" ... ... @@ -502,27 +509,31 @@ class MDP(object): Ppolicy(SxS) = transition matrix for policy PRpolicy(S) = reward matrix for policy """ Ppolicy = zeros(()) PRpolicy = zeros(()) for a in range(self.A): # avoid looping over S ind = find(self.policy == a) # the rows that use action a if not isempty(ind): if iscell(P): Ppolicy[ind,:] = self.P[a][ind,:] else: Ppolicy[ind,:] = self.P[ind,:,a] Ppolicy = matrix(zeros((self.S, self.S))) Rpolicy = matrix(zeros((self.S, 1))) for aa in range(self.A): # avoid looping over S # the rows that use action a. .getA1() is used to make sure that # ind is a 1 dimensional vector ind = nonzero(self.policy == aa)[0].getA1() if ind.size > 0: # if no rows use action a, then no point continuing Ppolicy[ind, :] = self.P[aa][ind, :] PR = self.computePR() PRpolicy[ind, 1] = PR[ind, a] # self.R cannot be sparse with the code in its current condition #PR = self.computePR() # an apparently uneeded line, and # perhaps harmful in this implementation c.f. # mdp_computePpolicyPRpolicy.m Rpolicy[ind] = self.R[ind, aa] # self.R cannot be sparse with the code in its current condition, but # it should be possible in the future. 
Also, if R is so big that its # a good idea to use a sparse matrix for it, then converting PRpolicy # from a dense to sparse matrix doesn't seem very memory efficient if type(self.R) is sparse: PRpolicy = sparse(PRpolicy) Rpolicy = sparse(Rpolicy) #self.Ppolicy = Ppolicy #self.Rpolicy = PRpolicy return (Ppolicy, PRpolicy) #self.Rpolicy = Rpolicy return (Ppolicy, Rpolicy) def computePR(self, P, R): """Computes the reward for the system in one state chosing an action ... ... @@ -584,13 +595,6 @@ class MDP(object): """ raise NotImplementedError("You should create an iterate() method.") def getSpan(self, W): """Returns the span of W sp(W) = max W(s) - min W(s) """ return (W.max() - W.min()) def setSilent(self): """Ask for running resolution functions of the MDP Toolbox in silent mode. ... ... @@ -804,6 +808,8 @@ class PolicyIteration(MDP): if eval_type in (0, "matrix"): from numpy.linalg import solve from scipy.sparse import eye self.speye = eye self.lin_eq = solve self.eval_type = "matrix" elif eval_type in (1, "iterative"): ... ... @@ -844,11 +850,11 @@ class PolicyIteration(MDP): epsilon-optimum value function found or maximum number of iterations reached. """ if V0 == 0: Vpolicy = zeros(self.S, 1) policy_V = zeros((self.S, 1)) else: raise NotImplementedError("evalPolicyIterative: case V0 != 0 not implemented. Use V0=0 instead.") Ppolicy, PRpolicy = self.computePpolicyPRpolicy(self.P, self.R, self.policy) policy_P, policy_R = self.computePpolicyPRpolicy() if self.verbose: print(' Iteration V_variation') ... ... 
@@ -857,21 +863,24 @@ class PolicyIteration(MDP): done = False while not done: itr = itr + 1 Vprev = Vpolicy Vpolicy = PRpolicy + self.discount * Ppolicy * Vprev variation = max(abs(Vpolicy - Vprev)) Vprev = policy_V policy_V = policy_R + self.discount * policy_P * Vprev variation = absolute(policy_V - Vprev).max() if self.verbose: print(' %s %s') % (itr, variation) if variation < ((1 - self.discount) / self.discount) * epsilon: # to ensure |Vn - Vpolicy| < epsilon done = True if self.verbose: print('MDP Toolbox: iterations stopped, epsilon-optimal value function') print('PyMDPtoolbox: iterations stopped, epsilon-optimal value function') elif itr == max_iter: done = True if self.verbose: print('MDP Toolbox: iterations stopped by maximum number of iteration condition') print('PyMDPtoolbox: iterations stopped by maximum number of iteration condition') self.value = Vpolicy self.value = policy_V def evalPolicyMatrix(self): """Evaluation of the value function of a policy ... ... @@ -894,12 +903,12 @@ class PolicyIteration(MDP): Vpolicy(S) = value function of the policy """ Ppolicy, PRpolicy = self.computePpolicyPRpolicy(self.P, self.R, self.policy) Ppolicy, Rpolicy = self.computePpolicyPRpolicy() # V = PR + gPV => (I-gP)V = PR => V = inv(I-gP)* PR self.value = self.lin_eq((speye(self.S, self.S) - self.discount * Ppolicy) , PRpolicy) self.value = self.lin_eq((self.speye(self.S, self.S) - self.discount * Ppolicy) , Rpolicy) def iterate(self): """""" """Run the policy iteration algorithm.""" if self.verbose: print(' Iteration Number_of_different_actions') ... ... 
@@ -927,8 +936,16 @@ class PolicyIteration(MDP): if self.verbose: print(' %s %s') % (self.iter, n_different) if (policy_next == self.policy).all() or (self.iter == self.max_iter): if n_different == 0: done = True if self.verbose: print("...iterations stopped, unchanging policy found") elif (self.iter == self.max_iter): done = True if self.verbose: print("...iterations stopped by maximum number of iteration condition") else: self.policy = policy_next self.time = time() - self.time ... ... @@ -1009,7 +1026,7 @@ class PolicyIterationModified(MDP): Vnext, policy = self.bellmanOperator(self.P, self.PR, self.discount, self.V) #[Ppolicy, PRpolicy] = mdp_computePpolicyPRpolicy(P, PR, policy); variation = self.getSpan(Vnext - self.value); variation = getSpan(Vnext - self.value); if self.verbose: print(" %s %s" % (self.iter, variation)) ... ... @@ -1237,7 +1254,7 @@ class RelativeValueIteration(MDP): Unext, policy = self.bellmanOperator(self.P, self.R, 1, self.U) Unext = Unext - self.gain variation = self.getSpan(Unext - self.U) variation = getSpan(Unext - self.U) if self.verbose: print(" %s %s" % (self.iter, variation)) ... ... @@ -1382,8 +1399,8 @@ class ValueIteration(MDP): if (initial_value == 0): self.value = matrix(zeros((self.S, 1))) else: if (initial_value.size != self.S): raise ValueError("The initial value must be length S") if (not initial_value.shape in ((self.S, ), (self.S, 1), (1, self.S))): raise ValueError("The initial value must be a vector of length S") else: self.value = matrix(initial_value) ... ... 
@@ -1431,10 +1448,10 @@ class ValueIteration(MDP): k = 1 - h.sum() Vprev = self.value self.bellmanOperator() null, value = self.bellmanOperator() # p 201, Proposition 6.6.5 max_iter = log( (epsilon * (1 - self.discount) / self.discount) / self.getSpan(self.value - Vprev) ) / log(self.discount * k) self.value = Vprev max_iter = log( (epsilon * (1 - self.discount) / self.discount) / getSpan(value - Vprev) ) / log(self.discount * k) #self.value = Vprev self.max_iter = ceil(max_iter) ... ... @@ -1458,7 +1475,7 @@ class ValueIteration(MDP): # The values, based on Q. For the function "max()": the option # "axis" means the axis along which to operate. In this case it # finds the maximum of the the rows. (Operates along the columns?) variation = self.getSpan(self.value - Vprev) variation = getSpan(self.value - Vprev) if self.verbose: print(" %s %s" % (self.iter, variation)) ... ... @@ -1566,7 +1583,7 @@ class ValueIterationGS(ValueIteration): Q[a] = self.R[s,a] + self.discount * self.P[a][s,:] * self.value self.value[s] = max(Q) variation = self.getSpan(V - Vprev) variation = getSpan(V - Vprev) if self.verbose: print(" %s %s" % (self.iter, variation)) ... ...
 ... ... @@ -172,10 +172,10 @@ R = array([[5, 10], [-1, 2]]) # MDP def test_MDP_P_R_1(): P1 = zeros((ACTIONS, ), dtype=object) P1[0] = matrix([[0.5, 0.5],[0.8, 0.2]]) P1[1] = matrix([[0, 1],[0.1, 0.9]]) R1 = matrix([[5, 10], [-1, 2]]) P1 = zeros((2, ), dtype=object) P1[0] = matrix('0.5 0.5; 0.8 0.2') P1[1] = matrix('0 1; 0.1 0.9') R1 = matrix('5 10; -1 2') a = MDP(P, R, 0.9, 0.01) assert a.P.dtype == P1.dtype assert a.R.dtype == R1.dtype ... ... @@ -185,10 +185,10 @@ def test_MDP_P_R_1(): def test_MDP_P_R_2(): R = array([[[5, 10], [-1, 2]], [[1, 2], [3, 4]]]) P1 = zeros((ACTIONS, ), dtype=object) P1[0] = matrix([[0.5, 0.5],[0.8, 0.2]]) P1[1] = matrix([[0, 1],[0.1, 0.9]]) R1 = matrix([[7.5, 2], [-0.4, 3.9]]) P1 = zeros((2, ), dtype=object) P1[0] = matrix('0.5 0.5; 0.8 0.2') P1[1] = matrix('0 1; 0.1 0.9') R1 = matrix('7.5 2; -0.4 3.9') a = MDP(P, R, 0.9, 0.01) assert type(a.P) == type(P1) assert type(a.R) == type(R1) ... ... @@ -201,7 +201,7 @@ def test_MDP_P_R_2(): def test_MDP_P_R_3(): P = array([[[0.6116, 0.3884],[0, 1]],[[0.6674, 0.3326],[0, 1]]]) R = array([[[-0.2433, 0.7073],[0, 0.1871]],[[-0.0069, 0.6433],[0, 0.2898]]]) PR = matrix([[0.12591304, 0.20935652], [0.1871, 0.2898]]) PR = matrix('0.12591304 0.20935652; 0.1871 0.2898') a = MDP(P, R, 0.9, 0.01) assert (absolute(a.R - PR) < SMALLNUM).all() ... ... 
@@ -229,16 +229,73 @@ def test_ValueIteration_exampleForest(): def test_PolicyIteration_init_policy0(): a = PolicyIteration(P, R, 0.9) p = array((1, 1)).reshape(2, 1) assert (absolute(a.policy - p) < SMALLNUM).all() def test_PolicyIteration(): PolicyIteration(P, R, 0.9) #inst.iterate() #assert (abs(inst.value[0] - 42.4419) < 0.001) #assert (abs(inst.value[1] - 36.0465) < 0.001) #assert (inst.policy == (1, 0)) #assert (inst.iter == 2) p = matrix('1; 1') assert (a.policy == p).all() def test_PolicyIteration_init_policy0_exampleForest(): P, R = exampleForest() a = PolicyIteration(P, R, 0.9) p = matrix('0; 1; 0') assert (a.policy == p).all() def test_PolicyIteration_computePpolicyPRpolicy_exampleForest(): P, R = exampleForest() a = PolicyIteration(P, R, 0.9) P1 = matrix('0.1 0.9 0; 1 0 0; 0.1 0 0.9') R1 = matrix('0; 1; 4') Ppolicy, Rpolicy = a.computePpolicyPRpolicy() assert (absolute(Ppolicy - P1) < SMALLNUM).all() assert (absolute(Rpolicy - R1) < SMALLNUM).all() def test_PolicyIteration_evalPolicyIterative_exampleForest(): P, R = exampleForest() v0 = matrix('0; 0; 0') v1 = matrix('4.47504640074458; 5.02753258879703; 23.17234211944304') p = matrix('0; 1; 0') a = PolicyIteration(P, R, 0.9) assert (absolute(a.value - v0) < SMALLNUM).all() a.evalPolicyIterative() assert (absolute(a.value - v1) < SMALLNUM).all() assert (a.policy == p).all() def test_PolicyIteration_evalPolicyIterative_bellmanOperator_exampleForest(): P, R = exampleForest() v = matrix('4.47504640074458; 5.02753258879703; 23.17234211944304') p = matrix('0; 0; 0') a = PolicyIteration(P, R, 0.9) a.evalPolicyIterative() policy, value = a.bellmanOperator() assert (policy == p).all() assert (absolute(a.value - v) < SMALLNUM).all() def test_PolicyIteration_iterative_exampleForest(): P, R = exampleForest() a = PolicyIteration(P, R, 0.9, eval_type=1) V = matrix('26.2439058351861 29.4839058351861 33.4839058351861') p = matrix('0 0 0') itr = 2 a.iterate() assert (absolute(array(a.value) - V) < SMALLNUM).all() assert 
(array(a.policy) == p).all() assert a.iter == itr def test_PolicyIteration_evalPolicyMatrix_exampleForest(): P, R = exampleForest() v_pol = matrix('4.47513812154696; 5.02762430939227; 23.17243384704857') a = PolicyIteration(P, R, 0.9) a.evalPolicyMatrix() assert (absolute(a.value - v_pol) < SMALLNUM).all() def test_PolicyIteration_matrix_exampleForest(): P, R = exampleForest() a = PolicyIteration(P, R, 0.9) V = matrix('26.2440000000000 29.4840000000000 33.4840000000000') p = matrix('0 0 0') itr = 2 a.iterate() assert (absolute(array(a.value) - V) < SMALLNUM).all() assert (array(a.policy) == p).all() assert a.iter == itr #def test_JacksCarRental(): # S = 21 ** 2 ... ...
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment