# -*- coding: utf-8 -*-
"""Markov Decision Process (MDP) Toolbox: ``mdp`` module
=====================================================

The ``mdp`` module provides classes for the resolution of discrete-time Markov
Decision Processes.

Available classes
-----------------
MDP
    Base Markov decision process class
FiniteHorizon
    Backwards induction finite horizon MDP
LP
    Linear programming MDP
PolicyIteration
    Policy iteration MDP
PolicyIterationModified
    Modified policy iteration MDP
QLearning
    Q-learning MDP
RelativeValueIteration
    Relative value iteration MDP
ValueIteration
    Value iteration MDP
ValueIterationGS
    Gauss-Seidel value iteration MDP

"""

# Copyright (c) 2011-2013 Steven A. W. Cordwell
# Copyright (c) 2009 INRA
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#   * Redistributions of source code must retain the above copyright notice,
#     this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions and the following disclaimer in the documentation
#     and/or other materials provided with the distribution.
#   * Neither the name of the <ORGANIZATION> nor the names of its contributors
#     may be used to endorse or promote products derived from this software
#     without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import math as _math
import time as _time

import numpy as _np
import scipy.sparse as _sp

try:
    from .util import check, getSpan
except ValueError:
    # importing mdp as a module rather than as part of a package
    from util import check, getSpan

_MSG_STOP_MAX_ITER = "Iterating stopped due to maximum number of iterations " \
    "condition."
_MSG_STOP_EPSILON_OPTIMAL_POLICY = "Iterating stopped, epsilon-optimal " \
    "policy found."
_MSG_STOP_EPSILON_OPTIMAL_VALUE = "Iterating stopped, epsilon-optimal value " \
    "function found."
_MSG_STOP_UNCHANGING_POLICY = "Iterating stopped, unchanging policy found."

class MDP(object):

    """A Markov Decision Problem.

    Let ``S`` = the number of states, and ``A`` = the number of actions.

    Parameters
    ----------
    transitions : array
        Transition probability matrices. These can be defined in a variety of
        ways. The simplest is a numpy array that has the shape ``(A, S, S)``,
        though there are other possibilities. It can be a tuple or list or
        numpy object array of length ``A``, where each element contains a numpy
        array or matrix that has the shape ``(S, S)``. This "list of matrices"
        form is useful when the transition matrices are sparse as
        ``scipy.sparse.csr_matrix`` matrices can be used. In summary, each
        action's transition matrix must be indexable like ``transitions[a]``
        where ``a`` ∈ {0, 1...A-1}, and ``transitions[a]`` returns an ``S`` ×
        ``S`` array-like object.
    reward : array
        Reward matrices or vectors. Like the transition matrices, these can
        also be defined in a variety of ways. Again the simplest is a numpy
        array that has the shape ``(S, A)``, ``(S,)`` or ``(A, S, S)``. A list
        of lists can be used, where each inner list has length ``S`` and the
        outer list has length ``A``. A list of numpy arrays is possible where
        each inner array can be of the shape ``(S,)``, ``(S, 1)``, ``(1, S)``
        or ``(S, S)``. Also ``scipy.sparse.csr_matrix`` can be used instead of
        numpy arrays. In addition, the outer list can be replaced by any object
        that can be indexed like ``reward[a]`` such as a tuple or numpy object
        array of length ``A``.
    discount : float
        Discount factor. The per time-step discount factor on future rewards.
        Valid values are greater than 0 up to and including 1. If the discount
        factor is 1, then convergence cannot be assumed and a warning will
        be displayed. Subclasses of ``MDP`` may pass ``None`` in the case where
        the algorithm does not use a discount factor.
    epsilon : float
        Stopping criterion. The maximum change in the value function at each
        iteration is compared against ``epsilon``. Once the change falls below
        this value, then the value function is considered to have converged to
        the optimal value function. Subclasses of ``MDP`` may pass ``None`` in
        the case where the algorithm does not use an epsilon-optimal stopping
        criterion.
    max_iter : int
        Maximum number of iterations. The algorithm will be terminated once
        this many iterations have elapsed. This must be greater than 0 if
        specified. Subclasses of ``MDP`` may pass ``None`` in the case where
        the algorithm does not use a maximum number of iterations.

    Attributes
    ----------
    P : array
        Transition probability matrices.
    R : array
        Reward vectors.
    V : tuple
        The optimal value function. Each element is a float corresponding to
        the expected value of being in that state assuming the optimal policy
        is followed.
    discount : float
        The discount rate on future rewards.
    max_iter : int
        The maximum number of iterations.
    policy : tuple
        The optimal policy.
    time : float
        The time used to converge to the optimal policy.
    verbose : boolean
        Whether verbose output should be displayed or not.

    Methods
    -------
    run
        Implemented in child classes as the main algorithm loop. Raises an
        exception if it has not been overridden.
    setSilent
        Turn the verbosity off
    setVerbose
        Turn the verbosity on
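
    Examples
    --------
    A minimal sketch of the simplest input formats described above, reusing
    the small two-state, two-action arrays from the ``LP`` example later in
    this module (the numbers are illustrative only):

    >>> import numpy
    >>> P = numpy.array([[[0.5, 0.5], [0.8, 0.2]],
    ...                  [[0.0, 1.0], [0.1, 0.9]]])  # shape (A, S, S)
    >>> R = numpy.array([[5, 10], [-1, 2]])  # shape (S, A)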

    """

    def __init__(self, transitions, reward, discount, epsilon, max_iter):
        # Initialise a MDP based on the input parameters.

        # if the discount is None then the algorithm is assumed to not use it
        # in its computations
        if discount is not None:
            self.discount = float(discount)
            assert 0.0 < self.discount <= 1.0, "Discount rate must be in ]0; 1]"
            if self.discount == 1:
                print("WARNING: check conditions of convergence. With no "
                      "discount, convergence can not be assumed.")
        # if the max_iter is None then the algorithm is assumed to not use it
        # in its computations
        if max_iter is not None:
            self.max_iter = int(max_iter)
            assert self.max_iter > 0, "The maximum number of iterations " \
                                      "must be greater than 0."
        # check that epsilon is something sane
        if epsilon is not None:
            self.epsilon = float(epsilon)
            assert self.epsilon > 0, "Epsilon must be greater than 0."
        # we run a check on P and R to make sure they are describing an MDP. If
        # an exception isn't raised then they are assumed to be correct.
        check(transitions, reward)
        # computePR will assign the variables self.S, self.A, self.P and self.R
        self._computePR(transitions, reward)
        # the verbosity is by default turned off
        self.verbose = False
        # Initially the time taken to perform the computations is set to None
        self.time = None
        # set the initial iteration count to zero
        self.iter = 0
        # V should be stored as a vector ie shape of (S,) or (1, S)
        self.V = None
        # policy can also be stored as a vector
        self.policy = None

    def __repr__(self):
        P_repr = "P: \n"
        R_repr = "R: \n"
        for aa in range(self.A):
            P_repr += repr(self.P[aa]) + "\n"
            R_repr += repr(self.R[aa]) + "\n"
        return(P_repr + "\n" + R_repr)

    def _bellmanOperator(self, V=None):
        # Apply the Bellman operator on the value function.
        #
        # Updates the value function and the Vprev-improving policy.
        #
        # Returns: (policy, value), tuple of new policy and its value
        #
        # If V hasn't been sent into the method, then we assume to be working
        # on the objects V attribute
        if V is None:
            # this V should be a reference to the data rather than a copy
            V = self.V
        else:
            # make sure the user supplied V is of the right shape
            try:
                assert V.shape in ((self.S,), (1, self.S)), "V is not the " \
                    "right shape (Bellman operator)."
            except AttributeError:
                raise TypeError("V must be a numpy array or matrix.")
        # Looping through each action, the Q-value matrix is calculated.
        # P and V can be any object that supports indexing, so it is important
        # that you know they define a valid MDP before calling the
        # _bellmanOperator method. Otherwise the results will be meaningless.
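        # In index form, the update computed below is
        #     Q[aa, s] = R[aa][s] + discount * sum_s'( P[aa][s, s'] * V[s'] )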
        Q = _np.empty((self.A, self.S))
        for aa in range(self.A):
            Q[aa] = self.R[aa] + self.discount * self.P[aa].dot(V)
        # Get the policy and value, for now it is being returned but...
        # Which way is better?
        # 1. Return, (policy, value)
        return (Q.argmax(axis=0), Q.max(axis=0))
        # 2. update self.policy and self.V directly
        # self.V = Q.max(axis=1)
        # self.policy = Q.argmax(axis=1)

    def _computeP(self, P):
        # Set self.P as a tuple of length A, with each element storing an S×S
        # matrix.
        self.A = len(P)
        try:
            if P.ndim == 3:
                self.S = P.shape[1]
            else:
                self.S = P[0].shape[0]
        except AttributeError:
            self.S = P[0].shape[0]
        # convert P to a tuple of numpy arrays
        self.P = tuple(P[aa] for aa in range(self.A))

    def _computePR(self, P, R):
        # Compute the reward for the system in one state choosing an action.
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        #    P(SxSxA)  = transition matrix
        #        P could be an array with 3 dimensions or  a cell array (1xA),
        #        each cell containing a matrix (SxS) possibly sparse
        #    R(SxSxA) or (SxA) = reward matrix
        #        R could be an array with 3 dimensions (SxSxA) or  a cell array
        #        (1xA), each cell containing a sparse matrix (SxS) or a 2D
        #        array(SxA) possibly sparse
        # Evaluation
        # ----------
        #    PR(SxA)   = reward matrix
        #
        # We assume that P and R define a MDP, i.e. the assumption is that
        # check(P, R) has already been run and doesn't fail.
        #
        # First compute and store P, S, and A
        self._computeP(P)
        # Set self.R as a tuple of length A, with each element storing a 1×S
        # vector.
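        # When R is given per transition (shape (A, S, S), or a list of S×S
        # matrices), the expected immediate reward computed below is
        #     r(s, aa) = sum_s'( P[aa][s, s'] * R[aa][s, s'] )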
        try:
            if R.ndim == 1:
                r = _np.array(R).reshape(self.S)
                self.R = tuple(r for aa in range(self.A))
            elif R.ndim == 2:
                self.R = tuple(_np.array(R[:, aa]).reshape(self.S)
                               for aa in range(self.A))
            else:
                self.R = tuple(_np.multiply(P[aa], R[aa]).sum(1).reshape(self.S)
                               for aa in range(self.A))
        except AttributeError:
            if len(R) == self.A:
                self.R = tuple(_np.multiply(P[aa], R[aa]).sum(1).reshape(self.S)
                               for aa in range(self.A))
            else:
                r = _np.array(R).reshape(self.S)
                self.R = tuple(r for aa in range(self.A))

    def run(self):
        # Raise error because child classes should implement this function.
        raise NotImplementedError("You should create a run() method.")

    def setSilent(self):
        """Set the MDP algorithm to silent mode."""
        self.verbose = False

    def setVerbose(self):
        """Set the MDP algorithm to verbose mode."""
        self.verbose = True

class FiniteHorizon(MDP):

    """A MDP solved using the finite-horizon backwards induction algorithm.

    Parameters
    ----------
    transitions : array
        Transition probability matrices. See the documentation for the ``MDP``
        class for details.
    reward : array
        Reward matrices or vectors. See the documentation for the ``MDP`` class
        for details.
    discount : float
        Discount factor. See the documentation for the ``MDP`` class for
        details.
    N : int
        Number of periods. Must be greater than 0.
    h : array, optional
        Terminal reward. Default: a vector of zeros.

    Data Attributes
    ---------------
    V : array
        Optimal value function. Shape = (S, N+1). ``V[:, n]`` = optimal value
        function at stage ``n`` with stage in {0, 1...N-1}. ``V[:, N]`` = value
        function for the terminal stage.
    policy : array
        Optimal policy. ``policy[:, n]`` = optimal policy at stage ``n`` with
        stage in {0, 1...N-1}.
    time : float
        used CPU time

    Notes
    -----
    In verbose mode, displays the current stage and policy transpose.

    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.forest()
    >>> fh = mdptoolbox.mdp.FiniteHorizon(P, R, 0.9, 3)
    >>> fh.run()
    >>> fh.V
    array([[ 2.6973,  0.81  ,  0.    ,  0.    ],
           [ 5.9373,  3.24  ,  1.    ,  0.    ],
           [ 9.9373,  7.24  ,  4.    ,  0.    ]])
    >>> fh.policy
    array([[0, 0, 0],
           [0, 0, 1],
           [0, 0, 0]])

    """

    def __init__(self, transitions, reward, discount, N, h=None):
        # Initialise a finite horizon MDP.
        self.N = int(N)
        assert self.N > 0, "N must be greater than 0."
        # Initialise the base class
        MDP.__init__(self, transitions, reward, discount, None, None)
        # remove the iteration counter, it is not meaningful for backwards
        # induction
        del self.iter
        # There are value vectors for each time step up to the horizon
        self.V = _np.zeros((self.S, N + 1))
        # There are policy vectors for each time step before the horizon, when
        # we reach the horizon we don't need to make decisions anymore.
        self.policy = _np.empty((self.S, N), dtype=int)
        # Set the reward for the final transition to h, if specified.
        if h is not None:
            self.V[:, N] = h

    def run(self):
        # Run the finite horizon algorithm.
        self.time = _time.time()
        # loop through each time period
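        # each backwards step applies the Bellman backup
        #     V[:, t] = max_aa( R[aa] + discount * P[aa].dot(V[:, t + 1]) )
        # for t = N-1, ..., 0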
        for n in range(self.N):
            W, X = self._bellmanOperator(self.V[:, self.N - n])
            stage = self.N - n - 1
            self.V[:, stage] = X
            self.policy[:, stage] = W
            if self.verbose:
                print(("stage: %s, policy: %s") % (
                    stage, self.policy[:, stage].tolist()))
        # update time spent running
        self.time = _time.time() - self.time
        # After this we could create a tuple of tuples for the values and
        # policies.
        #self.V = tuple(tuple(self.V[:, n].tolist()) for n in range(self.N))
        #self.policy = tuple(tuple(self.policy[:, n].tolist())
        #                    for n in range(self.N))

class LP(MDP):

    """A discounted MDP solved using linear programming.

    This class requires the Python ``cvxopt`` module to be installed.

    Arguments
    ---------
    transitions : array
        Transition probability matrices. See the documentation for the ``MDP``
        class for details.
    reward : array
        Reward matrices or vectors. See the documentation for the ``MDP`` class
        for details.
    discount : float
        Discount factor. See the documentation for the ``MDP`` class for
        details.

    Data Attributes
    ---------------
    V : tuple
        optimal values
    policy : tuple
        optimal policy
    time : float
        used CPU time

    Examples
    --------
    >>> import mdptoolbox.example
    >>> P, R = mdptoolbox.example.forest()
    >>> lp = mdptoolbox.mdp.LP(P, R, 0.9)
    >>> lp.run()

    >>> import numpy, mdptoolbox
    >>> P = numpy.array((((0.5, 0.5), (0.8, 0.2)), ((0, 1), (0.1, 0.9))))
    >>> R = numpy.array(((5, 10), (-1, 2)))
    >>> lp = mdptoolbox.mdp.LP(P, R, 0.9)
    >>> lp.run()
    >>> #lp.policy #FIXME: gives (1, 1), should be (1, 0)

    """

    def __init__(self, transitions, reward, discount):
        # Initialise a linear programming MDP.
        # import some functions from cvxopt and set them as object methods
        try:
            from cvxopt import matrix, solvers
            self._linprog = solvers.lp
            self._cvxmat = matrix
        except ImportError:
            raise ImportError("The python module cvxopt is required to use "
                              "linear programming functionality.")
        # initialise the MDP. epsilon and max_iter are not needed
        MDP.__init__(self, transitions, reward, discount, None, None)
        # Set the cvxopt solver to be quiet by default, but ...
        # this doesn't do what I want it to do c.f. issue #3
        if not self.verbose:
            solvers.options['show_progress'] = False

    def run(self):
        # Run the linear programming algorithm.
        self.time = _time.time()
        # The objective is to solve: min V / V >= PR + discount*P*V
        # The function linprog of the optimisation Toolbox of Mathworks
        # solves:
        # min f'* x / M * x <= b
        # So the objective could be expressed as :
        # min V / (discount*P-I) * V <= - PR
        # To avoid a loop over states, the matrix M is structured following actions
        # M(A*S,S)
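        # Stacking one S×S block per action means that the single constraint
        # M * V <= -h encodes, for every state s and action aa,
        #     V[s] >= R[aa][s] + discount * sum_s'( P[aa][s, s'] * V[s'] )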
        f = self._cvxmat(_np.ones((self.S, 1)))
        h = _np.array(self.R).reshape(self.S * self.A, 1, order="F")
        h = self._cvxmat(h, tc='d')
        M = _np.zeros((self.A * self.S, self.S))
        for aa in range(self.A):
            pos = (aa + 1) * self.S
            M[(pos - self.S):pos, :] = (
                self.discount * self.P[aa] - _sp.eye(self.S, self.S))
        M = self._cvxmat(M)
        # Using the glpk option will make this behave more like Octave
        # (Octave uses glpk) and perhaps Matlab. If solver=None (ie using the
        # default cvxopt solver) then V agrees with the Octave equivalent
        # only to 10e-8 places. This assumes glpk is installed of course.
        self.V = _np.array(self._linprog(f, M, -h)['x']).reshape(self.S)
        # apply the Bellman operator
        self.policy, self.V = self._bellmanOperator()
        # update the time spent solving
        self.time = _time.time() - self.time
        # store value and policy as tuples
        self.V = tuple(self.V.tolist())
        self.policy = tuple(self.policy.tolist())

class PolicyIteration(MDP):

    """A discounted MDP solved using the policy iteration algorithm.

    Arguments
    ---------
    transitions : array
        Transition probability matrices. See the documentation for the ``MDP``
        class for details.
    reward : array
        Reward matrices or vectors. See the documentation for the ``MDP`` class
        for details.
    discount : float
        Discount factor. See the documentation for the ``MDP`` class for
        details.
    policy0 : array, optional
        Starting policy.
    max_iter : int, optional
        Maximum number of iterations. See the documentation for the ``MDP``
        class for details. Default is 1000.
    eval_type : int or string, optional
        Type of function used to evaluate policy. 0 or "matrix" to solve as a
        set of linear equations. 1 or "iterative" to solve iteratively.
        Default: 0.

    Data Attributes
    ---------------
    V : tuple
        value function
    policy : tuple
        optimal policy
    iter : int
        number of done iterations
    time : float
        used CPU time

    Notes
    -----
    In verbose mode, at each iteration, displays the number of different
    actions between policy n-1 and n.

    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.rand(10, 3)
    >>> pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.9)
    >>> pi.run()

    >>> P, R = mdptoolbox.example.forest()
    >>> pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.9)
    >>> pi.run()
    >>> expected = (26.244000000000014, 29.484000000000016, 33.484000000000016)
    >>> all(expected[k] - pi.V[k] < 1e-12 for k in range(len(expected)))
    True
    >>> pi.policy
    (0, 0, 0)
    """

    def __init__(self, transitions, reward, discount, policy0=None,
                 max_iter=1000, eval_type=0):
        # Initialise a policy iteration MDP.
        #
        # Set up the MDP, but don't need to worry about epsilon values
        MDP.__init__(self, transitions, reward, discount, None, max_iter)
        # Check if the user has supplied an initial policy. If not make one.
        if policy0 is None:
            # Initialise the policy to the one which maximises the expected
            # immediate reward
            null = _np.zeros(self.S)
            self.policy, null = self._bellmanOperator(null)
            del null
        else:
            # Use the policy that the user supplied
            # Make sure it is a numpy array
            policy0 = _np.array(policy0)
            # Make sure the policy is the right size and shape
            assert policy0.shape in ((self.S, ), (self.S, 1), (1, self.S)), \
                "'policy0' must be a vector with length S."
            # reshape the policy to be a vector
            policy0 = policy0.reshape(self.S)
            # The policy can only contain integers between 0 and S-1
            msg = "'policy0' must be a vector of integers between 0 and S-1."
            assert not _np.mod(policy0, 1).any(), msg
            assert (policy0 >= 0).all(), msg
            assert (policy0 < self.S).all(), msg
            self.policy = policy0
        # set the initial values to zero
        self.V = _np.zeros(self.S)
        # Do some setup depending on the evaluation type
        if eval_type in (0, "matrix"):
            self.eval_type = "matrix"
        elif eval_type in (1, "iterative"):
            self.eval_type = "iterative"
        else:
            raise ValueError("'eval_type' should be '0' for matrix evaluation "
                             "or '1' for iterative evaluation. The strings "
                             "'matrix' and 'iterative' can also be used.")

    def _computePpolicyPRpolicy(self):
        # Compute the transition matrix and the reward matrix for a policy.
        #
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        # P(SxSxA)  = transition matrix
        #     P could be an array with 3 dimensions or a cell array (1xA),
        #     each cell containing a matrix (SxS) possibly sparse
        # R(SxSxA) or (SxA) = reward matrix
        #     R could be an array with 3 dimensions (SxSxA) or
        #     a cell array (1xA), each cell containing a sparse matrix (SxS) or
        #     a 2D array(SxA) possibly sparse
        # policy(S) = a policy
        #
        # Evaluation
        # ----------
        # Ppolicy(SxS)  = transition matrix for policy
        # PRpolicy(S)   = reward matrix for policy
        #
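        # i.e. Ppolicy[s, :] = P[policy[s]][s, :] and
        #      Rpolicy[s]    = R[policy[s]][s]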
        Ppolicy = _np.empty((self.S, self.S))
        Rpolicy = _np.zeros(self.S)
        for aa in range(self.A): # avoid looping over S
            # the rows that use action a.
            ind = (self.policy == aa).nonzero()[0]
            # if no rows use action a, then no need to assign this
            if ind.size > 0:
                try:
                    Ppolicy[ind, :] = self.P[aa][ind, :]
                except ValueError:
                    Ppolicy[ind, :] = self.P[aa][ind, :].todense()
                #PR = self._computePR() # an apparently unneeded line, and
                # perhaps harmful in this implementation c.f.
                # mdp_computePpolicyPRpolicy.m
                Rpolicy[ind] = self.R[aa][ind]
        # self.R cannot be sparse with the code in its current condition, but
        # it should be possible in the future. Also, if R is so big that its
        # a good idea to use a sparse matrix for it, then converting PRpolicy
        # from a dense to sparse matrix doesn't seem very memory efficient
        if type(self.R) is _sp.csr_matrix:
            Rpolicy = _sp.csr_matrix(Rpolicy)
        #self.Ppolicy = Ppolicy
        #self.Rpolicy = Rpolicy
        return (Ppolicy, Rpolicy)

    def _evalPolicyIterative(self, V0=0, epsilon=0.0001, max_iter=10000):
        # Evaluate a policy using iteration.
        #
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        # P(SxSxA)  = transition matrix
        #    P could be an array with 3 dimensions or
        #    a cell array (1xS), each cell containing a matrix possibly sparse
        # R(SxSxA) or (SxA) = reward matrix
        #    R could be an array with 3 dimensions (SxSxA) or
        #    a cell array (1xA), each cell containing a sparse matrix (SxS) or
        #    a 2D array(SxA) possibly sparse
        # discount  = discount rate in ]0; 1[
        # policy(S) = a policy
        # V0(S)     = starting value function, optional (default : zeros(S,1))
        # epsilon   = epsilon-optimal policy search, upper than 0,
        #    optional (default : 0.0001)
        # max_iter  = maximum number of iteration to be done, upper than 0,
        #    optional (default : 10000)
        #
        # Evaluation
        # ----------
        # Vpolicy(S) = value function, associated to a specific policy
        #
        # Notes
        # -----
        # In verbose mode, at each iteration, displays the condition which
        # stopped iterations: epsilon-optimum value function found or maximum
        # number of iterations reached.
        #
        try:
            assert V0.shape in ((self.S, ), (self.S, 1), (1, self.S)), \
                "'V0' must be a vector of length S."
            policy_V = _np.array(V0).reshape(self.S)
        except AttributeError:
            if V0 == 0:
                policy_V = _np.zeros(self.S)
            else:
                policy_V = _np.array(V0).reshape(self.S)

        policy_P, policy_R = self._computePpolicyPRpolicy()

        if self.verbose:
            print('    Iteration\t\t    V variation')

        itr = 0
        done = False
        while not done:
            itr += 1

            Vprev = policy_V
            policy_V = policy_R + self.discount * policy_P.dot(Vprev)

            variation = _np.absolute(policy_V - Vprev).max()
            if self.verbose:
                print(('      %s\t\t      %s') % (itr, variation))

            # ensure |Vn - Vpolicy| < epsilon
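            # (the contraction property gives |V_n - V_policy| <=
            #  discount/(1-discount) * |V_n - V_n-1|, so this threshold
            #  guarantees the epsilon bound)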
            if variation < ((1 - self.discount) / self.discount) * epsilon:
                done = True
                if self.verbose:
                    print(_MSG_STOP_EPSILON_OPTIMAL_VALUE)
            elif itr == max_iter:
                done = True
                if self.verbose:
                    print(_MSG_STOP_MAX_ITER)

        self.V = policy_V

    def _evalPolicyMatrix(self):
        # Evaluate the value function of the policy using linear equations.
        #
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        # P(SxSxA) = transition matrix
        #      P could be an array with 3 dimensions or a cell array (1xA),
        #      each cell containing a matrix (SxS) possibly sparse
        # R(SxSxA) or (SxA) = reward matrix
        #      R could be an array with 3 dimensions (SxSxA) or
        #      a cell array (1xA), each cell containing a sparse matrix (SxS) or
        #      a 2D array(SxA) possibly sparse
        # discount = discount rate in ]0; 1[
        # policy(S) = a policy
        #
        # Evaluation
        # ----------
        # Vpolicy(S) = value function of the policy
        #
        Ppolicy, Rpolicy = self._computePpolicyPRpolicy()
        # V = PR + gPV  => (I-gP)V = PR  => V = inv(I-gP)* PR
        self.V = _np.linalg.solve(
            (_sp.eye(self.S, self.S) - self.discount * Ppolicy), Rpolicy)

    def run(self):
        # Run the policy iteration algorithm.
        # If verbose then print a header
        if self.verbose:
            print('  Iteration\t\tNumber of different actions')
        # Set up the while stopping condition and the current time
        done = False
        self.time = _time.time()
        # loop until a stopping condition is reached
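        # each pass evaluates the current policy (updating self.V), then
        # greedily improves it with one Bellman backup; iteration stops when
        # the greedy policy no longer changes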
        while not done:
            self.iter += 1
            # these _evalPolicy* functions will update the class's value
            # attribute
            if self.eval_type == "matrix":
                self._evalPolicyMatrix()
            elif self.eval_type == "iterative":
                self._evalPolicyIterative()
            # This should update the class's policy attribute but leave the
            # value alone
            policy_next, null = self._bellmanOperator()
            del null
            # calculate in how many places the old policy disagrees with
            # the new policy
            n_different = (policy_next != self.policy).sum()
            # if verbose then continue printing a table
            if self.verbose:
                print(('    %s\t\t  %s') % (self.iter, n_different))
            # Once the policy is unchanging or the maximum number
            # of iterations has been reached then stop
            if n_different == 0:
                done = True
                if self.verbose:
                    print(_MSG_STOP_UNCHANGING_POLICY)
            elif self.iter == self.max_iter:
                done = True
                if self.verbose:
                    print(_MSG_STOP_MAX_ITER)
            else:
                self.policy = policy_next
        # update the time to return the computation time
        self.time = _time.time() - self.time
        # store value and policy as tuples
        self.V = tuple(self.V.tolist())
        self.policy = tuple(self.policy.tolist())

class PolicyIterationModified(PolicyIteration):

    """A discounted MDP solved using a modified policy iteration algorithm.

    Arguments
    ---------