Commit 8bf7fe2a by Steven Cordwell

### [mdp] Break apart mdp module into smaller pieces

```Initial attempt at making the mdp module more manageable. It has too
many lines of code making it hard to work on any one algorithm.```
parent 079751c4
 # -*- coding: utf-8 -*- """ Created on Fri Jan 9 13:41:56 2015 @author: steve """ import time as _time import numpy as _np from mdptoolbox.mdp_base import MDP class FiniteHorizon(MDP): """A MDP solved using the finite-horizon backwards induction algorithm. Parameters ---------- transitions : array Transition probability matrices. See the documentation for the ``MDP`` class for details. reward : array Reward matrices or vectors. See the documentation for the ``MDP`` class for details. discount : float Discount factor. See the documentation for the ``MDP`` class for details. N : int Number of periods. Must be greater than 0. h : array, optional Terminal reward. Default: a vector of zeros. Data Attributes --------------- V : array Optimal value function. Shape = (S, N+1). ``V[:, n]`` = optimal value function at stage ``n`` with stage in {0, 1...N-1}. ``V[:, N]`` value function for terminal stage. policy : array Optimal policy. ``policy[:, n]`` = optimal policy at stage ``n`` with stage in {0, 1...N}. ``policy[:, N]`` = policy for stage ``N``. time : float used CPU time Notes ----- In verbose mode, displays the current stage and policy transpose. Examples -------- >>> import mdptoolbox, mdptoolbox.example >>> P, R = mdptoolbox.example.forest() >>> fh = mdptoolbox.mdp.FiniteHorizon(P, R, 0.9, 3) >>> fh.run() >>> fh.V array([[ 2.6973, 0.81 , 0. , 0. ], [ 5.9373, 3.24 , 1. , 0. ], [ 9.9373, 7.24 , 4. , 0. ]]) >>> fh.policy array([[0, 0, 0], [0, 0, 1], [0, 0, 0]]) """ def __init__(self, transitions, reward, discount, N, h=None): # Initialise a finite horizon MDP. self.N = int(N) assert self.N > 0, "N must be greater than 0." # Initialise the base class MDP.__init__(self, transitions, reward, discount, None, None) # remove the iteration counter, it is not meaningful for backwards # induction del self.iter # There are value vectors for each time step up to the horizon self.V = _np.zeros((self.S, N + 1)) # There are policy vectors for each time step before the horizon, when # we reach the horizon we don't need to make decisions anymore. self.policy = _np.empty((self.S, N), dtype=int) # Set the reward for the final transition to h, if specified. if h is not None: self.V[:, N] = h def run(self): # Run the finite horizon algorithm. self.time = _time.time() # loop through each time period for n in range(self.N): W, X = self._bellmanOperator(self.V[:, self.N - n]) stage = self.N - n - 1 self.V[:, stage] = X self.policy[:, stage] = W if self.verbose: print(("stage: %s, policy: %s") % ( stage, self.policy[:, stage].tolist())) # update time spent running self.time = _time.time() - self.time # After this we could create a tuple of tuples for the values and # policies. #self.V = tuple(tuple(self.V[:, n].tolist()) for n in range(self.N)) #self.policy = tuple(tuple(self.policy[:, n].tolist()) # for n in range(self.N))
 # -*- coding: utf-8 -*- """ """ import time as _time import numpy as _np import scipy.sparse as _sp from mdptoolbox.mdp_base import MDP class LP(MDP): """A discounted MDP soloved using linear programming. This class requires the Python ``cvxopt`` module to be installed. Arguments --------- transitions : array Transition probability matrices. See the documentation for the ``MDP`` class for details. reward : array Reward matrices or vectors. See the documentation for the ``MDP`` class for details. discount : float Discount factor. See the documentation for the ``MDP`` class for details. h : array, optional Terminal reward. Default: a vector of zeros. Data Attributes --------------- V : tuple optimal values policy : tuple optimal policy time : float used CPU time Examples -------- >>> import mdptoolbox.example >>> P, R = mdptoolbox.example.forest() >>> lp = mdptoolbox.mdp.LP(P, R, 0.9) >>> lp.run() >>> import numpy, mdptoolbox >>> P = numpy.array((((0.5, 0.5), (0.8, 0.2)), ((0, 1), (0.1, 0.9)))) >>> R = numpy.array(((5, 10), (-1, 2))) >>> lp = mdptoolbox.mdp.LP(P, R, 0.9) >>> lp.run() >>> #lp.policy #FIXME: gives (1, 1), should be (1, 0) """ def __init__(self, transitions, reward, discount): # Initialise a linear programming MDP. # import some functions from cvxopt and set them as object methods try: from cvxopt import matrix, solvers self._linprog = solvers.lp self._cvxmat = matrix except ImportError: raise ImportError("The python module cvxopt is required to use " "linear programming functionality.") # initialise the MDP. epsilon and max_iter are not needed MDP.__init__(self, transitions, reward, discount, None, None) # Set the cvxopt solver to be quiet by default, but ... # this doesn't do what I want it to do c.f. issue #3 if not self.verbose: solvers.options['show_progress'] = False def run(self): #Run the linear programming algorithm. self.time = _time.time() # The objective is to resolve : min V / V >= PR + discount*P*V # The function linprog of the optimisation Toolbox of Mathworks # resolves : # min f'* x / M * x <= b # So the objective could be expressed as : # min V / (discount*P-I) * V <= - PR # To avoid loop on states, the matrix M is structured following actions # M(A*S,S) f = self._cvxmat(_np.ones((self.S, 1))) h = _np.array(self.R).reshape(self.S * self.A, 1, order="F") h = self._cvxmat(h, tc='d') M = _np.zeros((self.A * self.S, self.S)) for aa in range(self.A): pos = (aa + 1) * self.S M[(pos - self.S):pos, :] = ( self.discount * self.P[aa] - _sp.eye(self.S, self.S)) M = self._cvxmat(M) # Using the glpk option will make this behave more like Octave # (Octave uses glpk) and perhaps Matlab. If solver=None (ie using the # default cvxopt solver) then V agrees with the Octave equivalent # only to 10e-8 places. This assumes glpk is installed of course. self.V = _np.array(self._linprog(f, M, -h)['x']).reshape(self.S) # apply the Bellman operator self.policy, self.V = self._bellmanOperator() # update the time spent solving self.time = _time.time() - self.time # store value and policy as tuples self.V = tuple(self.V.tolist()) self.policy = tuple(self.policy.tolist())
 ... ... @@ -57,1465 +57,12 @@ ValueIterationGS # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. import math as _math import time as _time import numpy as _np import scipy.sparse as _sp import mdptoolbox.util as _util _MSG_STOP_MAX_ITER = "Iterating stopped due to maximum number of iterations " \ "condition." _MSG_STOP_EPSILON_OPTIMAL_POLICY = "Iterating stopped, epsilon-optimal " \ "policy found." _MSG_STOP_EPSILON_OPTIMAL_VALUE = "Iterating stopped, epsilon-optimal value " \ "function found." _MSG_STOP_UNCHANGING_POLICY = "Iterating stopped, unchanging policy found." class MDP(object): """A Markov Decision Problem. Let ``S`` = the number of states, and ``A`` = the number of acions. Parameters ---------- transitions : array Transition probability matrices. These can be defined in a variety of ways. The simplest is a numpy array that has the shape ``(A, S, S)``, though there are other possibilities. It can be a tuple or list or numpy object array of length ``A``, where each element contains a numpy array or matrix that has the shape ``(S, S)``. This "list of matrices" form is useful when the transition matrices are sparse as ``scipy.sparse.csr_matrix`` matrices can be used. In summary, each action's transition matrix must be indexable like ``transitions[a]`` where ``a`` ∈ {0, 1...A-1}, and ``transitions[a]`` returns an ``S`` × ``S`` array-like object. reward : array Reward matrices or vectors. Like the transition matrices, these can also be defined in a variety of ways. Again the simplest is a numpy array that has the shape ``(S, A)``, ``(S,)`` or ``(A, S, S)``. A list of lists can be used, where each inner list has length ``S`` and the outer list has length ``A``. A list of numpy arrays is possible where each inner array can be of the shape ``(S,)``, ``(S, 1)``, ``(1, S)`` or ``(S, S)``. Also ``scipy.sparse.csr_matrix`` can be used instead of numpy arrays. In addition, the outer list can be replaced by any object that can be indexed like ``reward[a]`` such as a tuple or numpy object array of length ``A``. discount : float Discount factor. The per time-step discount factor on future rewards. Valid values are greater than 0 upto and including 1. If the discount factor is 1, then convergence is cannot be assumed and a warning will be displayed. Subclasses of ``MDP`` may pass ``None`` in the case where the algorithm does not use a discount factor. epsilon : float Stopping criterion. The maximum change in the value function at each iteration is compared against ``epsilon``. Once the change falls below this value, then the value function is considered to have converged to the optimal value function. Subclasses of ``MDP`` may pass ``None`` in the case where the algorithm does not use an epsilon-optimal stopping criterion. max_iter : int Maximum number of iterations. The algorithm will be terminated once this many iterations have elapsed. This must be greater than 0 if specified. Subclasses of ``MDP`` may pass ``None`` in the case where the algorithm does not use a maximum number of iterations. Attributes ---------- P : array Transition probability matrices. R : array Reward vectors. V : tuple The optimal value function. Each element is a float corresponding to the expected value of being in that state assuming the optimal policy is followed. discount : float The discount rate on future rewards. max_iter : int The maximum number of iterations. policy : tuple The optimal policy. time : float The time used to converge to the optimal policy. verbose : boolean Whether verbose output should be displayed or not. Methods ------- run Implemented in child classes as the main algorithm loop. Raises an exception if it has not been overridden. setSilent Turn the verbosity off setVerbose Turn the verbosity on """ def __init__(self, transitions, reward, discount, epsilon, max_iter): # Initialise a MDP based on the input parameters. # if the discount is None then the algorithm is assumed to not use it # in its computations if discount is not None: self.discount = float(discount) assert 0.0 < self.discount <= 1.0, "Discount rate must be in ]0; 1]" if self.discount == 1: print("WARNING: check conditions of convergence. With no " "discount, convergence can not be assumed.") # if the max_iter is None then the algorithm is assumed to not use it # in its computations if max_iter is not None: self.max_iter = int(max_iter) assert self.max_iter > 0, "The maximum number of iterations " \ "must be greater than 0." # check that epsilon is something sane if epsilon is not None: self.epsilon = float(epsilon) assert self.epsilon > 0, "Epsilon must be greater than 0." # we run a check on P and R to make sure they are describing an MDP. If # an exception isn't raised then they are assumed to be correct. _util.check(transitions, reward) # computePR will assign the variables self.S, self.A, self.P and self.R self._computePR(transitions, reward) # the verbosity is by default turned off self.verbose = False # Initially the time taken to perform the computations is set to None self.time = None # set the initial iteration count to zero self.iter = 0 # V should be stored as a vector ie shape of (S,) or (1, S) self.V = None # policy can also be stored as a vector self.policy = None def __repr__(self): P_repr = "P: \n" R_repr = "R: \n" for aa in range(self.A): P_repr += repr(self.P[aa]) + "\n" R_repr += repr(self.R[aa]) + "\n" return(P_repr + "\n" + R_repr) def _bellmanOperator(self, V=None): # Apply the Bellman operator on the value function. # # Updates the value function and the Vprev-improving policy. # # Returns: (policy, value), tuple of new policy and its value # # If V hasn't been sent into the method, then we assume to be working # on the objects V attribute if V is None: # this V should be a reference to the data rather than a copy V = self.V else: # make sure the user supplied V is of the right shape try: assert V.shape in ((self.S,), (1, self.S)), "V is not the " \ "right shape (Bellman operator)." except AttributeError: raise TypeError("V must be a numpy array or matrix.") # Looping through each action the the Q-value matrix is calculated. # P and V can be any object that supports indexing, so it is important # that you know they define a valid MDP before calling the # _bellmanOperator method. Otherwise the results will be meaningless. Q = _np.empty((self.A, self.S)) for aa in range(self.A): Q[aa] = self.R[aa] + self.discount * self.P[aa].dot(V) # Get the policy and value, for now it is being returned but... # Which way is better? # 1. Return, (policy, value) return (Q.argmax(axis=0), Q.max(axis=0)) # 2. update self.policy and self.V directly # self.V = Q.max(axis=1) # self.policy = Q.argmax(axis=1) def _computeP(self, P): # Set self.P as a tuple of length A, with each element storing an S×S # matrix. self.A = len(P) try: if P.ndim == 3: self.S = P.shape[1] else: self.S = P[0].shape[0] except AttributeError: self.S = P[0].shape[0] # convert P to a tuple of numpy arrays self.P = tuple(P[aa] for aa in range(self.A)) def _computePR(self, P, R): # Compute the reward for the system in one state chosing an action. # Arguments # --------- # Let S = number of states, A = number of actions # P(SxSxA) = transition matrix # P could be an array with 3 dimensions or a cell array (1xA), # each cell containing a matrix (SxS) possibly sparse # R(SxSxA) or (SxA) = reward matrix # R could be an array with 3 dimensions (SxSxA) or a cell array # (1xA), each cell containing a sparse matrix (SxS) or a 2D # array(SxA) possibly sparse # Evaluation # ---------- # PR(SxA) = reward matrix # # We assume that P and R define a MDP i,e. assumption is that # _util.check(P, R) has already been run and doesn't fail. # # First compute store P, S, and A self._computeP(P) # Set self.R as a tuple of length A, with each element storing an 1×S # vector. try: if R.ndim == 1: r = _np.array(R).reshape(self.S) self.R = tuple(r for aa in range(self.A)) elif R.ndim == 2: self.R = tuple(_np.array(R[:, aa]).reshape(self.S) for aa in range(self.A)) else: self.R = tuple(_np.multiply(P[aa], R[aa]).sum(1).reshape(self.S) for aa in range(self.A)) except AttributeError: if len(R) == self.A: self.R = tuple(_np.multiply(P[aa], R[aa]).sum(1).reshape(self.S) for aa in range(self.A)) else: r = _np.array(R).reshape(self.S) self.R = tuple(r for aa in range(self.A)) def run(self): # Raise error because child classes should implement this function. raise NotImplementedError("You should create a run() method.") def setSilent(self): """Set the MDP algorithm to silent mode.""" self.verbose = False def setVerbose(self): """Set the MDP algorithm to verbose mode.""" self.verbose = True class FiniteHorizon(MDP): """A MDP solved using the finite-horizon backwards induction algorithm. Parameters ---------- transitions : array Transition probability matrices. See the documentation for the ``MDP`` class for details. reward : array Reward matrices or vectors. See the documentation for the ``MDP`` class for details. discount : float Discount factor. See the documentation for the ``MDP`` class for details. N : int Number of periods. Must be greater than 0. h : array, optional Terminal reward. Default: a vector of zeros. Data Attributes --------------- V : array Optimal value function. Shape = (S, N+1). ``V[:, n]`` = optimal value function at stage ``n`` with stage in {0, 1...N-1}. ``V[:, N]`` value function for terminal stage. policy : array Optimal policy. ``policy[:, n]`` = optimal policy at stage ``n`` with stage in {0, 1...N}. ``policy[:, N]`` = policy for stage ``N``. time : float used CPU time Notes ----- In verbose mode, displays the current stage and policy transpose. Examples -------- >>> import mdptoolbox, mdptoolbox.example >>> P, R = mdptoolbox.example.forest() >>> fh = mdptoolbox.mdp.FiniteHorizon(P, R, 0.9, 3) >>> fh.run() >>> fh.V array([[ 2.6973, 0.81 , 0. , 0. ], [ 5.9373, 3.24 , 1. , 0. ], [ 9.9373, 7.24 , 4. , 0. ]]) >>> fh.policy array([[0, 0, 0], [0, 0, 1], [0, 0, 0]]) """ def __init__(self, transitions, reward, discount, N, h=None): # Initialise a finite horizon MDP. self.N = int(N) assert self.N > 0, "N must be greater than 0." # Initialise the base class MDP.__init__(self, transitions, reward, discount, None, None) # remove the iteration counter, it is not meaningful for backwards # induction del self.iter # There are value vectors for each time step up to the horizon self.V = _np.zeros((self.S, N + 1)) # There are policy vectors for each time step before the horizon, when # we reach the horizon we don't need to make decisions anymore. self.policy = _np.empty((self.S, N), dtype=int) # Set the reward for the final transition to h, if specified. if h is not None: self.V[:, N] = h def run(self): # Run the finite horizon algorithm. self.time = _time.time() # loop through each time period for n in range(self.N): W, X = self._bellmanOperator(self.V[:, self.N - n]) stage = self.N - n - 1 self.V[:, stage] = X self.policy[:, stage] = W if self.verbose: print(("stage: %s, policy: %s") % ( stage, self.policy[:, stage].tolist())) # update time spent running self.time = _time.time() - self.time # After this we could create a tuple of tuples for the values and # policies. #self.V = tuple(tuple(self.V[:, n].tolist()) for n in range(self.N)) #self.policy = tuple(tuple(self.policy[:, n].tolist()) # for n in range(self.N)) class LP(MDP): """A discounted MDP soloved using linear programming. This class requires the Python ``cvxopt`` module to be installed. Arguments --------- transitions : array Transition probability matrices. See the documentation for the ``MDP`` class for details. reward : array Reward matrices or vectors. See the documentation for the ``MDP`` class for details. discount : float Discount factor. See the documentation for the ``MDP`` class for details. h : array, optional Terminal reward. Default: a vector of zeros. Data Attributes --------------- V : tuple optimal values policy : tuple optimal policy time : float used CPU time Examples -------- >>> import mdptoolbox.example >>> P, R = mdptoolbox.example.forest() >>> lp = mdptoolbox.mdp.LP(P, R, 0.9) >>> lp.run() >>> import numpy, mdptoolbox >>> P = numpy.array((((0.5, 0.5), (0.8, 0.2)), ((0, 1), (0.1, 0.9)))) >>> R = numpy.array(((5, 10), (-1, 2))) >>> lp = mdptoolbox.mdp.LP(P, R, 0.9) >>> lp.run() >>> #lp.policy #FIXME: gives (1, 1), should be (1, 0) """ def __init__(self, transitions, reward, discount): # Initialise a linear programming MDP. # import some functions from cvxopt and set them as object methods try: from cvxopt import matrix, solvers self._linprog = solvers.lp self._cvxmat = matrix except ImportError: raise ImportError("The python module cvxopt is required to use " "linear programming functionality.") # initialise the MDP. epsilon and max_iter are not needed MDP.__init__(self, transitions, reward, discount, None, None) # Set the cvxopt solver to be quiet by default, but ... # this doesn't do what I want it to do c.f. issue #3 if not self.verbose: solvers.options['show_progress'] = False def run(self): #Run the linear programming algorithm. self.time = _time.time() # The objective is to resolve : min V / V >= PR + discount*P*V # The function linprog of the optimisation Toolbox of Mathworks # resolves : # min f'* x / M * x <= b # So the objective could be expressed as : # min V / (discount*P-I) * V <= - PR