Commit 6a58e49b by Steven Cordwell

### Merge branch 'rowops' into stable

parents 35f9cbd0 23464d78
 ... ... @@ -16,4 +16,4 @@ 7. Try to use rows for the Bellman computations rather than columns (perhaps this storage is more Pythonic? or more NumPyic?) 8. 8. Create the mask as a sparse matrix in exampleRand
 ... ... @@ -50,6 +50,8 @@ Code snippets are indicated by three greater-than signs:: >>> x = 17 >>> x = x + 1 >>> x 18 The documentation can be displayed with `IPython `_. For example, to view the docstring of ... ... @@ -96,7 +98,7 @@ from math import ceil, log, sqrt from random import random from time import time from numpy import absolute, array, diag, empty, matrix, mean, mod, multiply from numpy import absolute, array, diag, empty, mean, mod, multiply from numpy import ndarray, ones, where, zeros from numpy.random import rand, randint from scipy.sparse import csr_matrix as sparse ... ... @@ -724,7 +726,9 @@ class MDP(object): # set the initial iteration count to zero self.iter = 0 # V should be stored as a vector ie shape of (S,) or (1, S) self.V = None # policy can also be stored as a vector self.policy = None def _bellmanOperator(self, V=None): ... ... @@ -742,18 +746,18 @@ class MDP(object): V = self.V else: try: if V.shape != (self.S, 1): if V.shape not in ((self.S,), (1, self.S)): raise ValueError("bellman: V is not the right shape.") except AttributeError: raise TypeError("bellman: V must be a numpy array or matrix.") Q = matrix(empty((self.S, self.A))) Q = empty((self.A, self.S)) for aa in range(self.A): Q[:, aa] = self.R[aa] + (self.discount * self.P[aa] * V) Q[aa] = self.R[aa] + self.discount * self.P[aa].dot(V) # Which way is better? # 1. Return, (policy, value) return (Q.argmax(axis=1), Q.max(axis=1)) return (Q.argmax(axis=0), Q.max(axis=0)) # 2. update self.policy and self.V directly # self.V = Q.max(axis=1) # self.policy = Q.argmax(axis=1) ... ... @@ -796,19 +800,24 @@ class MDP(object): for aa in xrange(self.A): self.P.append(P[aa]) self.P = tuple(self.P) # Set self.R as a tuple of length A, with each element storing an S×1 # vector array. # Set self.R as a tuple of length A, with each element storing an 1×S # vector. try: if R.ndim == 2: self.R = [] for aa in xrange(self.A): self.R.append(R[:, aa].reshape(self.S, 1)) self.R.append(array(R[:, aa]).reshape(self.S)) else: raise AttributeError except AttributeError: self.R = [] for aa in xrange(self.A): self.R.append(multiply(P[aa], R[aa]).sum(1).reshape(self.S, 1)) try: self.R.append(P[aa].multiply(R[aa]).sum(1).reshape(self.S)) except AttributeError: self.R.append(multiply(P[aa],R[aa]).sum(1).reshape(self.S)) except: raise except: raise self.R = tuple(self.R) ... ... @@ -884,17 +893,18 @@ class FiniteHorizon(MDP): raise ValueError('PyMDPtoolbox: N must be greater than 0') else: self.N = N # Initialise the base class MDP.__init__(self, transitions, reward, discount, None, None) # remove the iteration counter # remove the iteration counter, it is not meaningful for backwards # induction del self.iter # There are value vectors for each time step up to the horizon self.V = zeros((self.S, N + 1)) self.policy = zeros((self.S, N), dtype=int) if not h is None: # There are policy vectors for each time step before the horizon, when # we reach the horizon we don't need to make decisions anymore. self.policy = empty((self.S, N), dtype=int) # Set the reward for the final transition to h, if specified. if h is not None: self.V[:, N] = h def iterate(self): ... ... @@ -902,15 +912,24 @@ class FiniteHorizon(MDP): self.time = time() for n in range(self.N): W, X = self._bellmanOperator( matrix(self.V[:, self.N - n]).reshape(self.S, 1)) self.V[:, self.N - n - 1] = X.A1 self.policy[:, self.N - n - 1] = W.A1 W, X = self._bellmanOperator(self.V[:, self.N - n]) self.V[:, self.N - n - 1] = X self.policy[:, self.N - n - 1] = W if self.verbose: print("stage: %s ... policy transpose : %s") % ( self.N - n, self.policy[:, self.N - n -1].tolist()) self.time = time() - self.time # After this we could create a tuple of tuples for the values and # policies. #V = [] #p = [] #for n in xrange(self.N): # V.append() # p.append() #V.append() #self.V = tuple(V) #self.policy = tuple(p) class LP(MDP): ... ... @@ -987,15 +1006,15 @@ class LP(MDP): # (Octave uses glpk) and perhaps Matlab. If solver=None (ie using the # default cvxopt solver) then V agrees with the Octave equivalent # only to 10e-8 places. self.V = matrix(self._linprog(f, M, -h, solver='glpk')['x']) self.V = array(self._linprog(f, M, -h, solver='glpk')['x']) self.policy, self.V = self._bellmanOperator() self.time = time() - self.time # store value and policy as tuples self.V = tuple(self.V.getA1().tolist()) self.policy = tuple(self.policy.getA1().tolist()) self.V = tuple(self.V.tolist()) self.policy = tuple(self.policy.tolist()) class PolicyIteration(MDP): ... ... @@ -1049,7 +1068,7 @@ class PolicyIteration(MDP): if policy0 == None: # initialise the policy to the one which maximises the expected # immediate reward self.V = matrix(zeros((self.S, 1))) self.V = zeros(self.S) self.policy, null = self._bellmanOperator() del null else: ... ... @@ -1059,7 +1078,7 @@ class PolicyIteration(MDP): raise ValueError("PyMDPtolbox: policy0 must a vector with " "length S.") policy0 = matrix(policy0.reshape(self.S, 1)) policy0 = policy0.reshape(self.S) if (mod(policy0, 1).any() or (policy0 < 0).any() or (policy0 >= self.S).any()): ... ... @@ -1069,7 +1088,7 @@ class PolicyIteration(MDP): self.policy = policy0 # set or reset the initial values to zero self.V = matrix(zeros((self.S, 1))) self.V = zeros(self.S) if eval_type in (0, "matrix"): from numpy.linalg import solve ... ... @@ -1106,29 +1125,24 @@ class PolicyIteration(MDP): PRpolicy(S) = reward matrix for policy """ Ppolicy = matrix(zeros((self.S, self.S))) Rpolicy = matrix(zeros((self.S, 1))) Ppolicy = empty((self.S, self.S)) Rpolicy = zeros(self.S) for aa in range(self.A): # avoid looping over S # the rows that use action a. .getA1() is used to make sure that # ind is a 1 dimensional vector ind = (self.policy == aa).nonzero()[0].getA1() # the rows that use action a. ind = (self.policy == aa).nonzero()[0] # if no rows use action a, then no need to assign this if ind.size > 0: Ppolicy[ind, :] = self.P[aa][ind, :] #PR = self._computePR() # an apparently uneeded line, and # perhaps harmful in this implementation c.f. # mdp_computePpolicyPRpolicy.m Rpolicy[ind] = self.R[aa][ind] # self.R cannot be sparse with the code in its current condition, but # it should be possible in the future. Also, if R is so big that its # a good idea to use a sparse matrix for it, then converting PRpolicy # from a dense to sparse matrix doesn't seem very memory efficient if type(self.R) is sparse: Rpolicy = sparse(Rpolicy) #self.Ppolicy = Ppolicy #self.Rpolicy = Rpolicy return (Ppolicy, Rpolicy) ... ... @@ -1166,9 +1180,9 @@ class PolicyIteration(MDP): """ if (type(V0) in (int, float)) and (V0 == 0): policy_V = zeros((self.S, 1)) policy_V = zeros(self.S) else: if (type(V0) in (ndarray, matrix)) and (V0.shape == (self.S, 1)): if (type(V0) in (ndarray)) and (V0.shape == (self.S, 1)): policy_V = V0 else: raise ValueError("PyMDPtoolbox: V0 vector/array type not " ... ... @@ -1186,7 +1200,7 @@ class PolicyIteration(MDP): itr += 1 Vprev = policy_V policy_V = policy_R + self.discount * policy_P * Vprev policy_V = policy_R + self.discount * policy_P.dot(Vprev) variation = absolute(policy_V - Vprev).max() if self.verbose: ... ... @@ -1279,8 +1293,8 @@ class PolicyIteration(MDP): self.time = time() - self.time # store value and policy as tuples self.V = tuple(self.V.getA1().tolist()) self.policy = tuple(self.policy.getA1().tolist()) self.V = tuple(self.V.tolist()) self.policy = tuple(self.policy.tolist()) class PolicyIterationModified(PolicyIteration): ... ... @@ -1354,7 +1368,7 @@ class PolicyIterationModified(PolicyIteration): self.epsilon = epsilon if discount == 1: self.V = matrix(zeros((self.S, 1))) self.V = zeros((self.S, 1)) else: # min(min()) is not right self.V = 1 / (1 - discount) * self.R.min() * ones((self.S, 1)) ... ... @@ -1395,8 +1409,8 @@ class PolicyIterationModified(PolicyIteration): self.time = time() - self.time # store value and policy as tuples self.V = tuple(self.V.getA1().tolist()) self.policy = tuple(self.policy.getA1().tolist()) self.V = tuple(self.V.tolist()) self.policy = tuple(self.policy.tolist()) class QLearning(MDP): ... ... @@ -1636,7 +1650,7 @@ class RelativeValueIteration(MDP): self.epsilon = epsilon self.discount = 1 self.V = matrix(zeros((self.S, 1))) self.V = zeros(self.S) self.gain = 0 # self.U[self.S] self.average_reward = None ... ... @@ -1681,8 +1695,8 @@ class RelativeValueIteration(MDP): self.time = time() - self.time # store value and policy as tuples self.V = tuple(self.V.getA1().tolist()) self.policy = tuple(self.policy.getA1().tolist()) self.V = tuple(self.V.tolist()) self.policy = tuple(self.policy.tolist()) class ValueIteration(MDP): ... ... @@ -1727,7 +1741,7 @@ class ValueIteration(MDP): Greater than 0, optional (default: computed) initial_value : array, optional starting value function optional (default: zeros(S,1)). optional (default: zeros(S,)). Data Attributes --------------- ... ... @@ -1792,7 +1806,7 @@ class ValueIteration(MDP): >>> import mdp >>> import numpy as np >>> from scipy.sparse import csr_matrix as sparse >>> P = np.empty(2, dtype=object) >>> P = [None] * 2 >>> P[0] = sparse([[0.5, 0.5],[0.8, 0.2]]) >>> P[1] = sparse([[0, 1],[0.1, 0.9]]) >>> R = np.array([[5, 10], [-1, 2]]) ... ... @@ -1813,15 +1827,18 @@ class ValueIteration(MDP): # initialization of optional arguments if initial_value == 0: self.V = matrix(zeros((self.S, 1))) self.V = zeros(self.S) else: if not initial_value.shape in ((self.S, ), (self.S, 1), (1, self.S)): raise ValueError("PyMDPtoolbox: The initial value must be a " "vector of length S.") if len(initial_value) != self.S: raise ValueError("PyMDPtoolbox: The initial value must be " "a vector of length S.") else: self.V = matrix(initial_value) try: self.V = initial_value.reshape(self.S) except AttributeError: self.V = array(initial_value) except: raise if self.discount < 1: # compute a bound for the number of iterations and update the # stored value of self.max_iter ... ... @@ -1860,12 +1877,12 @@ class ValueIteration(MDP): h = zeros(self.S) for ss in range(self.S): PP = zeros((self.S, self.A)) PP = zeros((self.A, self.S)) for aa in range(self.A): try: PP[:, aa] = self.P[aa][:, ss] PP[aa] = self.P[aa][:, ss] except ValueError: PP[:, aa] = self.P[aa][:, ss].todense() PP[aa] = self.P[aa][:, ss].todense().A1 except: raise # the function "min()" without any arguments finds the ... ... @@ -1918,8 +1935,8 @@ class ValueIteration(MDP): "iteration condition.") # store value and policy as tuples self.V = tuple(self.V.getA1().tolist()) self.policy = tuple(self.policy.getA1().tolist()) self.V = tuple(self.V.tolist()) self.policy = tuple(self.policy.tolist()) self.time = time() - self.time ... ... @@ -1986,10 +2003,9 @@ class ValueIterationGS(ValueIteration): Vprev = self.V.copy() for s in range(self.S): Q = [] for a in range(self.A): Q.append(float(self.R[a][s] + self.discount * self.P[a][s, :] * self.V)) Q = [float(self.R[a][s]+ self.discount * self.P[a][s, :].dot(self.V)) for a in range(self.A)] self.V[s] = max(Q) ... ... @@ -2014,14 +2030,14 @@ class ValueIterationGS(ValueIteration): for s in range(self.S): Q = zeros(self.A) for a in range(self.A): Q[a] = self.R[a][s] + self.P[a][s,:] * self.discount * self.V Q[a] = self.R[a][s] + self.discount * self.P[a][s,:].dot(self.V) self.V[s] = Q.max() self.policy.append(int(Q.argmax())) self.time = time() - self.time self.V = tuple(self.V.getA1().tolist()) self.V = tuple(self.V.tolist()) self.policy = tuple(self.policy) if __name__ == "__main__": ... ...
 ... ... @@ -10,7 +10,7 @@ nosetests installed, and then run from the command line. """ from random import seed as randseed #from random import seed as randseed from numpy import absolute, array, empty, eye, matrix, zeros from numpy.random import rand ... ... @@ -18,9 +18,7 @@ from scipy.sparse import eye as speye from scipy.sparse import csr_matrix as sparse #from scipy.stats.distributions import poisson from mdp import check, checkSquareStochastic, exampleForest, exampleRand from mdp import MDP, PolicyIteration, QLearning, RelativeValueIteration from mdp import ValueIteration, ValueIterationGS import mdp STATES = 10 ACTIONS = 3 ... ... @@ -32,9 +30,9 @@ R = array([[5, 10], [-1, 2]]) Ps = empty(2, dtype=object) Ps[0] = sparse([[0.5, 0.5],[0.8, 0.2]]) Ps[1] = sparse([[0, 1],[0.1, 0.9]]) Pf, Rf = exampleForest() Pr, Rr = exampleRand(STATES, ACTIONS) Prs, Rrs = exampleRand(STATES, ACTIONS, is_sparse=True) Pf, Rf = mdp.exampleForest() Pr, Rr = mdp.exampleRand(STATES, ACTIONS) Prs, Rrs = mdp.exampleRand(STATES, ACTIONS, is_sparse=True) # check: square, stochastic and non-negative ndarrays ... ... @@ -44,14 +42,14 @@ def test_check_square_stochastic_nonnegative_array_1(): for a in range(ACTIONS): P[a, :, :] = eye(STATES) R[:, a] = rand(STATES) assert (check(P, R) == None) assert (mdp.check(P, R) == None) def test_check_square_stochastic_nonnegative_array_2(): P = zeros((ACTIONS, STATES, STATES)) R = rand(ACTIONS, STATES, STATES) for a in range(ACTIONS): P[a, :, :] = eye(STATES) assert (check(P, R) == None) assert (mdp.check(P, R) == None) # check: P - square, stochastic and non-negative object arrays ... ... @@ -60,21 +58,21 @@ def test_check_P_square_stochastic_nonnegative_object_array(): R = rand(STATES, ACTIONS) for a in range(ACTIONS): P[a] = eye(STATES) assert (check(P, R) == None) assert (mdp.check(P, R) == None) def test_check_P_square_stochastic_nonnegative_object_matrix(): P = empty(ACTIONS, dtype=object) R = rand(STATES, ACTIONS) for a in range(ACTIONS): P[a] = matrix(eye(STATES)) assert (check(P, R) == None) assert (mdp.check(P, R) == None) def test_check_P_square_stochastic_nonnegative_object_sparse(): P = empty(ACTIONS, dtype=object) R = rand(STATES, ACTIONS) for a in range(ACTIONS): P[a] = speye(STATES, STATES).tocsr() assert (check(P, R) == None) assert (mdp.check(P, R) == None) # check: P - square, stochastic and non-negative lists ... ... @@ -83,21 +81,21 @@ def test_check_P_square_stochastic_nonnegative_list_array(): R = rand(STATES, ACTIONS) for a in xrange(ACTIONS): P.append(eye(STATES)) assert (check(P, R) == None) assert (mdp.check(P, R) == None) def test_check_P_square_stochastic_nonnegative_list_matrix(): P = [] R = rand(STATES, ACTIONS) for a in xrange(ACTIONS): P.append(matrix(eye(STATES))) assert (check(P, R) == None) assert (mdp.check(P, R) == None) def test_check_P_square_stochastic_nonnegative_list_sparse(): P = [] R = rand(STATES, ACTIONS) for a in xrange(ACTIONS): P.append(speye(STATES, STATES).tocsr()) assert (check(P, R) == None) assert (mdp.check(P, R) == None) # check: P - square, stochastic and non-negative dicts ... ... @@ -106,21 +104,21 @@ def test_check_P_square_stochastic_nonnegative_dict_array(): R = rand(STATES, ACTIONS) for a in xrange(ACTIONS): P[a] = eye(STATES) assert (check(P, R) == None) assert (mdp.check(P, R) == None) def test_check_P_square_stochastic_nonnegative_dict_matrix(): P = {} R = rand(STATES, ACTIONS) for a in xrange(ACTIONS): P[a] = matrix(eye(STATES)) assert (check(P, R) == None) assert (mdp.check(P, R) == None) def test_check_P_square_stochastic_nonnegative_dict_sparse(): P = {} R = rand(STATES, ACTIONS) for a in xrange(ACTIONS): P[a] = speye(STATES, STATES).tocsr() assert (check(P, R) == None) assert (mdp.check(P, R) == None) # check: R - square stochastic and non-negative sparse ... ... @@ -129,7 +127,7 @@ def test_check_R_square_stochastic_nonnegative_sparse(): R = sparse(rand(STATES, ACTIONS)) for a in range(ACTIONS): P[a, :, :] = eye(STATES) assert (check(P, R) == None) assert (mdp.check(P, R) == None) # check: R - square, stochastic and non-negative object arrays ... ... @@ -139,7 +137,7 @@ def test_check_R_square_stochastic_nonnegative_object_array(): for a in range(ACTIONS): P[a, :, :] = eye(STATES) R[a] = rand(STATES, STATES) assert (check(P, R) == None) assert (mdp.check(P, R) == None) def test_check_R_square_stochastic_nonnegative_object_matrix(): P = zeros((ACTIONS, STATES, STATES)) ... ... @@ -147,7 +145,7 @@ def test_check_R_square_stochastic_nonnegative_object_matrix(): for a in range(ACTIONS): P[a, :, :] = eye(STATES) R[a] = matrix(rand(STATES, STATES)) assert (check(P, R) == None) assert (mdp.check(P, R) == None) def test_check_R_square_stochastic_nonnegative_object_sparse(): P = zeros((ACTIONS, STATES, STATES)) ... ... @@ -155,7 +153,7 @@ def test_check_R_square_stochastic_nonnegative_object_sparse(): for a in range(ACTIONS): P[a, :, :] = eye(STATES) R[a] = sparse(rand(STATES, STATES)) assert (check(P, R) == None) assert (mdp.check(P, R) == None) # checkSquareStochastic: square, stochastic and non-negative ... ... @@ -163,35 +161,35 @@ def test_checkSquareStochastic_square_stochastic_nonnegative_array(): P = rand(STATES, STATES) for s in range(STATES): P[s, :] = P[s, :] / P[s, :].sum() assert checkSquareStochastic(P) == None assert mdp.checkSquareStochastic(P) == None def test_checkSquareStochastic_square_stochastic_nonnegative_matrix(): P = rand(STATES, STATES) for s in range(STATES): P[s, :] = P[s, :] / P[s, :].sum() P = matrix(P) assert checkSquareStochastic(P) == None assert mdp.checkSquareStochastic(P) == None def test_checkSquareStochastic_square_stochastic_nonnegative_sparse(): P = rand(STATES, STATES) for s in range(STATES): P[s, :] = P[s, :] / P[s, :].sum() P = sparse(P) assert checkSquareStochastic(P) == None assert mdp.checkSquareStochastic(P) == None # checkSquareStochastic: eye def test_checkSquareStochastic_eye_array(): P = eye(STATES) assert checkSquareStochastic(P) == None assert mdp.checkSquareStochastic(P) == None def test_checkSquareStochastic_eye_matrix(): P = matrix(eye(STATES)) assert checkSquareStochastic(P) == None assert mdp.checkSquareStochastic(P) == None def test_checkSquareStochastic_eye_sparse(): P = speye(STATES, STATES).tocsr() assert checkSquareStochastic(P) == None assert mdp.checkSquareStochastic(P) == None # exampleForest ... ... @@ -209,8 +207,8 @@ def test_exampleForest_R_shape(): [4, 2]])).all() def test_exampleForest_check(): P, R = exampleForest(10, 5, 3, 0.2) assert check(P, R) == None P, R = mdp.exampleForest(10, 5, 3, 0.2) assert mdp.check(P, R) == None # exampleRand ... ... @@ -221,29 +219,29 @@ def test_exampleRand_dense_R_shape(): assert (Rr.shape == (ACTIONS, STATES, STATES)) def test_exampleRand_dense_check(): assert check(Pr, Rr) == None assert mdp.check(Pr, Rr) == None def test_exampleRand_sparse_P_shape(): assert (Prs.shape == (ACTIONS, )) assert (len(Prs) == ACTIONS) def test_exampleRand_sparse_R_shape(): assert (Rrs.shape == (ACTIONS, )) assert (len(Rrs) == ACTIONS) def test_exampleRand_sparse_check(): assert check(Prs, Rrs) == None assert mdp.check(Prs, Rrs) == None # MDP def test_MDP_P_R_1(): P1 = [] P1.append(matrix('0.5 0.5; 0.8 0.2'))