Commit 361b42e0 by Steven Cordwell

### Changed PolicyIteration to be more like the original

parent 60e89bf1
 ... ... @@ -699,68 +699,54 @@ class PolicyIteration(MDP): """ def __init__(self, transitions, reward, discount, epsilon=0.01, max_iter=1000, initial_value=0): def __init__(self, transitions, reward, discount, policy0, max_iter=1000, eval_type=0): """""" MDP.__init__(self) self.check(transitions, reward) self.S = transitions.shape[1] self.A = transitions.shape[0] self.P = transitions if (size(policy0,1) != S or any(mod(policy0, 1)) or any(policy0 < 1) or any(policy0 > S)): raise ValueError('MDP Toolbox ERROR: policy0 must a (1xS) vector with integer from 1 to S') self.R = reward #self.computePR(transitions, reward) if (initial_value == 0): self.value = zeros((self.S)) #self.value = matrix(zeros((self.S, 1))) else: if (len(initial_value) != self.S): raise ValueError("The initial value must be length S") self.value = matrix(initial_value) MDP.__init__(self, transitions, reward, discount, max_iter) self.policy = randi(0, self.A, self.S) self.discount = discount self.max_iter = max_iter self.value = matrix(zeros((self.S, 1))) self.iter = 0 # initialise the policy to the one which maximises the expected # immediate reward self.bellmanOperator() def evalPolicyMatrix(self): """""" pass def iterate(self): """""" done = False stop_criterion = 0.01 if self.verbose: print(' Iteration Number_of_different_actions') self.time = time() while not done: stop = False while not stop: change = 0 for s in range(self.S): v = self.value[s] a = self.policy[s] self.value[s] = (self.P[a, s, :] * (self.R[a, s, :] + (self.discount * self.value))).sum() change = max(change, abs(v - self.value[s])) if change < stop_criterion: stop = True self.iter = self.iter + 1 policy_stable = True for s in range(self.S): b = self.policy[s] self.policy[s] = (self.P[:, s, :] * (self.R[:, s, :] + (self.discount * self.value))).sum(1).argmax() if b != self.policy[s]: policy_stable = False if eval_type == 0: self.value = self.evalPolicyMatrix() else: self.value = self.evalPolicyIterative() if policy_stable: 
policy_prev = self.policy self.bellmanOperator() n_different = (policy != policy_prev).sum() if self.verbose: print(' %s %s') % (self.iter, n_different) if (policy == policy_prev).all() or (self.iter == self.max_iter): done = True self.time = time() - self.time # store value and policy as tuples self.value = tuple(array(self.value).reshape(self.S).tolist()) self.policy = tuple(array(self.policy).reshape(self.S).tolist()) ... ...
 ... ... @@ -5,89 +5,90 @@ Created on Sun May 27 23:16:57 2012 @author: - """ from mdp import exampleForest, exampleRand, MDP, PolicyIteration, ValueIteration from numpy import array, eye, matrix, zeros from numpy.random import rand from scipy.sparse import eye as speye from scipy.sparse import csr_matrix as sparse from mdp import exampleForest, exampleRand, PolicyIteration, ValueIteration from numpy import array #from numpy import array, eye, matrix, zeros #from numpy.random import rand #from scipy.sparse import eye as speye #from scipy.sparse import csr_matrix as sparse #from scipy.stats.distributions import poisson inst = MDP() #inst = MDP() # STATES = 10 ACTIONS = 3 # check: square, stochastic and non-negative def test_check_square_stochastic_nonnegative_array(): P = zeros((ACTIONS, STATES, STATES)) R = zeros((STATES, ACTIONS)) for a in range(ACTIONS): P[a, :, :] = eye(STATES) R[:, a] = rand(STATES) inst.check(P, R) # check: square, stochastic and non-negative object arrays def test_check_square_stochastic_nonnegative_object_array(): P = zeros((ACTIONS, ), dtype=object) R = zeros((STATES, ACTIONS)) for a in range(ACTIONS): P[a] = eye(STATES) R[:, a] = rand(STATES) inst.check(P, R) def test_check_square_stochastic_nonnegative_object_matrix(): P = zeros((ACTIONS, ), dtype=object) R = zeros((STATES, ACTIONS)) for a in range(ACTIONS): P[a] = matrix(eye(STATES)) R[:, a] = rand(STATES) inst.check(P, R) def test_check_square_stochastic_nonnegative_object_sparse(): P = zeros((ACTIONS, ), dtype=object) R = zeros((STATES, ACTIONS)) for a in range(ACTIONS): P[a] = speye(STATES, STATES).tocsr() R[:, a] = rand(STATES) inst.check(P, R) # checkSquareStochastic: square, stochastic and non-negative def test_checkSquareStochastic_square_stochastic_nonnegative_array(): P = rand(STATES, STATES) for s in range(STATES): P[s, :] = P[s, :] / P[s, :].sum() assert inst.checkSquareStochastic(P) == None def test_checkSquareStochastic_square_stochastic_nonnegative_matrix(): P = 
rand(STATES, STATES) for s in range(STATES): P[s, :] = P[s, :] / P[s, :].sum() P = matrix(P) assert inst.checkSquareStochastic(P) == None def test_checkSquareStochastic_square_stochastic_nonnegative_sparse(): P = rand(STATES, STATES) for s in range(STATES): P[s, :] = P[s, :] / P[s, :].sum() P = sparse(P) assert inst.checkSquareStochastic(P) == None # checkSquareStochastic: eye def test_checkSquareStochastic_eye_array(): P = eye(STATES) assert inst.checkSquareStochastic(P) == None def test_checkSquareStochastic_eye_matrix(): P = matrix(eye(STATES)) assert inst.checkSquareStochastic(P) == None def test_checkSquareStochastic_eye_sparse(): P = speye(STATES, STATES).tocsr() assert inst.checkSquareStochastic(P) == None # ## check: square, stochastic and non-negative # #def test_check_square_stochastic_nonnegative_array(): # P = zeros((ACTIONS, STATES, STATES)) # R = zeros((STATES, ACTIONS)) # for a in range(ACTIONS): # P[a, :, :] = eye(STATES) # R[:, a] = rand(STATES) # inst.check(P, R) # ## check: square, stochastic and non-negative object arrays # #def test_check_square_stochastic_nonnegative_object_array(): # P = zeros((ACTIONS, ), dtype=object) # R = zeros((STATES, ACTIONS)) # for a in range(ACTIONS): # P[a] = eye(STATES) # R[:, a] = rand(STATES) # inst.check(P, R) # #def test_check_square_stochastic_nonnegative_object_matrix(): # P = zeros((ACTIONS, ), dtype=object) # R = zeros((STATES, ACTIONS)) # for a in range(ACTIONS): # P[a] = matrix(eye(STATES)) # R[:, a] = rand(STATES) # inst.check(P, R) # #def test_check_square_stochastic_nonnegative_object_sparse(): # P = zeros((ACTIONS, ), dtype=object) # R = zeros((STATES, ACTIONS)) # for a in range(ACTIONS): # P[a] = speye(STATES, STATES).tocsr() # R[:, a] = rand(STATES) # inst.check(P, R) # ## checkSquareStochastic: square, stochastic and non-negative # #def test_checkSquareStochastic_square_stochastic_nonnegative_array(): # P = rand(STATES, STATES) # for s in range(STATES): # P[s, :] = P[s, :] / P[s, :].sum() # assert 
inst.checkSquareStochastic(P) == None # #def test_checkSquareStochastic_square_stochastic_nonnegative_matrix(): # P = rand(STATES, STATES) # for s in range(STATES): # P[s, :] = P[s, :] / P[s, :].sum() # P = matrix(P) # assert inst.checkSquareStochastic(P) == None # #def test_checkSquareStochastic_square_stochastic_nonnegative_sparse(): # P = rand(STATES, STATES) # for s in range(STATES): # P[s, :] = P[s, :] / P[s, :].sum() # P = sparse(P) # assert inst.checkSquareStochastic(P) == None # ## checkSquareStochastic: eye # #def test_checkSquareStochastic_eye_array(): # P = eye(STATES) # assert inst.checkSquareStochastic(P) == None # #def test_checkSquareStochastic_eye_matrix(): # P = matrix(eye(STATES)) # assert inst.checkSquareStochastic(P) == None # #def test_checkSquareStochastic_eye_sparse(): # P = speye(STATES, STATES).tocsr() # assert inst.checkSquareStochastic(P) == None # exampleForest ... ... @@ -103,9 +104,9 @@ def test_exampleForest_shape(): [0, 1], [4, 2]])).all() def test_exampleForest_check(): P, R = exampleForest(10, 5, 3, 0.2) inst.check(P, R) #def test_exampleForest_check(): # P, R = exampleForest(10, 5, 3, 0.2) # inst.check(P, R) # exampleRand ... ... @@ -114,18 +115,18 @@ def test_exampleRand_dense_shape(): assert (P.shape == (ACTIONS, STATES, STATES)) assert (R.shape == (ACTIONS, STATES, STATES)) def test_exampleRand_dense_check(): P, R = exampleRand(STATES, ACTIONS) assert inst.check(P, R) == None #def test_exampleRand_dense_check(): # P, R = exampleRand(STATES, ACTIONS) # assert inst.check(P, R) == None def test_exampleRand_sparse_shape(): P, R = exampleRand(STATES, ACTIONS, is_sparse=True) assert (P.shape == (ACTIONS, )) assert (R.shape == (ACTIONS, )) def test_exampleRand_sparse_check(): P, R = exampleRand(STATES, ACTIONS, is_sparse=True) assert inst.check(P, R) == None #def test_exampleRand_sparse_check(): # P, R = exampleRand(STATES, ACTIONS, is_sparse=True) # assert inst.check(P, R) == None # ValueIteration ... ...
