Commit 361b42e0 by Steven Cordwell

### Changed PolicyIteration to be more like the original

parent 60e89bf1
 ... @@ -699,68 +699,54 @@ class PolicyIteration(MDP): ... @@ -699,68 +699,54 @@ class PolicyIteration(MDP): """ """ def __init__(self, transitions, reward, discount, epsilon=0.01, max_iter=1000, initial_value=0): def __init__(self, transitions, reward, discount, policy0, max_iter=1000, eval_type=0): """""" """""" MDP.__init__(self) self.check(transitions, reward) self.S = transitions.shape[1] self.A = transitions.shape[0] self.P = transitions if (size(policy0,1) != S or any(mod(policy0, 1)) or any(policy0 < 1) or any(policy0 > S)): raise ValueError('MDP Toolbox ERROR: policy0 must a (1xS) vector with integer from 1 to S') self.R = reward MDP.__init__(self, transitions, reward, discount, max_iter) #self.computePR(transitions, reward) if (initial_value == 0): self.value = zeros((self.S)) #self.value = matrix(zeros((self.S, 1))) else: if (len(initial_value) != self.S): raise ValueError("The initial value must be length S") self.value = matrix(initial_value) self.policy = randi(0, self.A, self.S) self.value = matrix(zeros((self.S, 1))) self.discount = discount self.max_iter = max_iter self.iter = 0 # initialise the policy to the one which maximises the expected # immediate reward self.bellmanOperator() def evalPolicyMatrix(self): """""" pass def iterate(self): def iterate(self): """""" """""" done = False done = False stop_criterion = 0.01 if self.verbose: print(' Iteration Number_of_different_actions') self.time = time() while not done: while not done: stop = False self.iter = self.iter + 1 while not stop: change = 0 for s in range(self.S): v = self.value[s] a = self.policy[s] self.value[s] = (self.P[a, s, :] * (self.R[a, s, :] + (self.discount * self.value))).sum() change = max(change, abs(v - self.value[s])) if change < stop_criterion: stop = True policy_stable = True if eval_type == 0: for s in range(self.S): self.value = self.evalPolicyMatrix() b = self.policy[s] else: self.policy[s] = (self.P[:, s, :] * (self.R[:, s, :] + self.value = self.evalPolicyIterative() 
(self.discount * self.value))).sum(1).argmax() if b != self.policy[s]: policy_stable = False if policy_stable: policy_prev = self.policy self.bellmanOperator() n_different = (policy != policy_prev).sum() if self.verbose: print(' %s %s') % (self.iter, n_different) if (policy == policy_prev).all() or (self.iter == self.max_iter): done = True done = True self.time = time() - self.time # store value and policy as tuples # store value and policy as tuples self.value = tuple(array(self.value).reshape(self.S).tolist()) self.value = tuple(array(self.value).reshape(self.S).tolist()) self.policy = tuple(array(self.policy).reshape(self.S).tolist()) self.policy = tuple(array(self.policy).reshape(self.S).tolist()) ... ...
 ... @@ -5,89 +5,90 @@ Created on Sun May 27 23:16:57 2012 ... @@ -5,89 +5,90 @@ Created on Sun May 27 23:16:57 2012 @author: - @author: - """ """ from mdp import exampleForest, exampleRand, MDP, PolicyIteration, ValueIteration from mdp import exampleForest, exampleRand, PolicyIteration, ValueIteration from numpy import array, eye, matrix, zeros from numpy import array from numpy.random import rand #from numpy import array, eye, matrix, zeros from scipy.sparse import eye as speye #from numpy.random import rand from scipy.sparse import csr_matrix as sparse #from scipy.sparse import eye as speye #from scipy.sparse import csr_matrix as sparse #from scipy.stats.distributions import poisson #from scipy.stats.distributions import poisson inst = MDP() #inst = MDP() # STATES = 10 STATES = 10 ACTIONS = 3 ACTIONS = 3 # # check: square, stochastic and non-negative ## check: square, stochastic and non-negative # def test_check_square_stochastic_nonnegative_array(): #def test_check_square_stochastic_nonnegative_array(): P = zeros((ACTIONS, STATES, STATES)) # P = zeros((ACTIONS, STATES, STATES)) R = zeros((STATES, ACTIONS)) # R = zeros((STATES, ACTIONS)) for a in range(ACTIONS): # for a in range(ACTIONS): P[a, :, :] = eye(STATES) # P[a, :, :] = eye(STATES) R[:, a] = rand(STATES) # R[:, a] = rand(STATES) inst.check(P, R) # inst.check(P, R) # # check: square, stochastic and non-negative object arrays ## check: square, stochastic and non-negative object arrays # def test_check_square_stochastic_nonnegative_object_array(): #def test_check_square_stochastic_nonnegative_object_array(): P = zeros((ACTIONS, ), dtype=object) # P = zeros((ACTIONS, ), dtype=object) R = zeros((STATES, ACTIONS)) # R = zeros((STATES, ACTIONS)) for a in range(ACTIONS): # for a in range(ACTIONS): P[a] = eye(STATES) # P[a] = eye(STATES) R[:, a] = rand(STATES) # R[:, a] = rand(STATES) inst.check(P, R) # inst.check(P, R) # def test_check_square_stochastic_nonnegative_object_matrix(): #def 
test_check_square_stochastic_nonnegative_object_matrix(): P = zeros((ACTIONS, ), dtype=object) # P = zeros((ACTIONS, ), dtype=object) R = zeros((STATES, ACTIONS)) # R = zeros((STATES, ACTIONS)) for a in range(ACTIONS): # for a in range(ACTIONS): P[a] = matrix(eye(STATES)) # P[a] = matrix(eye(STATES)) R[:, a] = rand(STATES) # R[:, a] = rand(STATES) inst.check(P, R) # inst.check(P, R) # def test_check_square_stochastic_nonnegative_object_sparse(): #def test_check_square_stochastic_nonnegative_object_sparse(): P = zeros((ACTIONS, ), dtype=object) # P = zeros((ACTIONS, ), dtype=object) R = zeros((STATES, ACTIONS)) # R = zeros((STATES, ACTIONS)) for a in range(ACTIONS): # for a in range(ACTIONS): P[a] = speye(STATES, STATES).tocsr() # P[a] = speye(STATES, STATES).tocsr() R[:, a] = rand(STATES) # R[:, a] = rand(STATES) inst.check(P, R) # inst.check(P, R) # # checkSquareStochastic: square, stochastic and non-negative ## checkSquareStochastic: square, stochastic and non-negative # def test_checkSquareStochastic_square_stochastic_nonnegative_array(): #def test_checkSquareStochastic_square_stochastic_nonnegative_array(): P = rand(STATES, STATES) # P = rand(STATES, STATES) for s in range(STATES): # for s in range(STATES): P[s, :] = P[s, :] / P[s, :].sum() # P[s, :] = P[s, :] / P[s, :].sum() assert inst.checkSquareStochastic(P) == None # assert inst.checkSquareStochastic(P) == None # def test_checkSquareStochastic_square_stochastic_nonnegative_matrix(): #def test_checkSquareStochastic_square_stochastic_nonnegative_matrix(): P = rand(STATES, STATES) # P = rand(STATES, STATES) for s in range(STATES): # for s in range(STATES): P[s, :] = P[s, :] / P[s, :].sum() # P[s, :] = P[s, :] / P[s, :].sum() P = matrix(P) # P = matrix(P) assert inst.checkSquareStochastic(P) == None # assert inst.checkSquareStochastic(P) == None # def test_checkSquareStochastic_square_stochastic_nonnegative_sparse(): #def test_checkSquareStochastic_square_stochastic_nonnegative_sparse(): P = rand(STATES, 
STATES) # P = rand(STATES, STATES) for s in range(STATES): # for s in range(STATES): P[s, :] = P[s, :] / P[s, :].sum() # P[s, :] = P[s, :] / P[s, :].sum() P = sparse(P) # P = sparse(P) assert inst.checkSquareStochastic(P) == None # assert inst.checkSquareStochastic(P) == None # # checkSquareStochastic: eye ## checkSquareStochastic: eye # def test_checkSquareStochastic_eye_array(): #def test_checkSquareStochastic_eye_array(): P = eye(STATES) # P = eye(STATES) assert inst.checkSquareStochastic(P) == None # assert inst.checkSquareStochastic(P) == None # def test_checkSquareStochastic_eye_matrix(): #def test_checkSquareStochastic_eye_matrix(): P = matrix(eye(STATES)) # P = matrix(eye(STATES)) assert inst.checkSquareStochastic(P) == None # assert inst.checkSquareStochastic(P) == None # def test_checkSquareStochastic_eye_sparse(): #def test_checkSquareStochastic_eye_sparse(): P = speye(STATES, STATES).tocsr() # P = speye(STATES, STATES).tocsr() assert inst.checkSquareStochastic(P) == None # assert inst.checkSquareStochastic(P) == None # exampleForest # exampleForest ... @@ -103,9 +104,9 @@ def test_exampleForest_shape(): ... @@ -103,9 +104,9 @@ def test_exampleForest_shape(): [0, 1], [0, 1], [4, 2]])).all() [4, 2]])).all() def test_exampleForest_check(): #def test_exampleForest_check(): P, R = exampleForest(10, 5, 3, 0.2) # P, R = exampleForest(10, 5, 3, 0.2) inst.check(P, R) # inst.check(P, R) # exampleRand # exampleRand ... @@ -114,18 +115,18 @@ def test_exampleRand_dense_shape(): ... 
@@ -114,18 +115,18 @@ def test_exampleRand_dense_shape(): assert (P.shape == (ACTIONS, STATES, STATES)) assert (P.shape == (ACTIONS, STATES, STATES)) assert (R.shape == (ACTIONS, STATES, STATES)) assert (R.shape == (ACTIONS, STATES, STATES)) def test_exampleRand_dense_check(): #def test_exampleRand_dense_check(): P, R = exampleRand(STATES, ACTIONS) # P, R = exampleRand(STATES, ACTIONS) assert inst.check(P, R) == None # assert inst.check(P, R) == None def test_exampleRand_sparse_shape(): def test_exampleRand_sparse_shape(): P, R = exampleRand(STATES, ACTIONS, is_sparse=True) P, R = exampleRand(STATES, ACTIONS, is_sparse=True) assert (P.shape == (ACTIONS, )) assert (P.shape == (ACTIONS, )) assert (R.shape == (ACTIONS, )) assert (R.shape == (ACTIONS, )) def test_exampleRand_sparse_check(): #def test_exampleRand_sparse_check(): P, R = exampleRand(STATES, ACTIONS, is_sparse=True) # P, R = exampleRand(STATES, ACTIONS, is_sparse=True) assert inst.check(P, R) == None # assert inst.check(P, R) == None # ValueIteration # ValueIteration ... ...
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!