# -*- coding: utf-8 -*-
"""Markov Decision Process (MDP) Toolbox: ``mdp`` module
=====================================================

The ``mdp`` module provides classes for the resolution of discrete-time Markov
Decision Processes.

Available classes
-----------------
MDP
    Base Markov decision process class
FiniteHorizon
    Backwards induction finite horizon MDP
LP
    Linear programming MDP
PolicyIteration
    Policy iteration MDP
PolicyIterationModified
    Modified policy iteration MDP
QLearning
    Q-learning MDP
RelativeValueIteration
    Relative value iteration MDP
ValueIteration
    Value iteration MDP
ValueIterationGS
    Gauss-Seidel value iteration MDP
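
The classes are all used in much the same way. As a minimal sketch (using the
``forest`` example from the companion ``mdptoolbox.example`` module and the
``ValueIteration`` class; any of the classes above can be substituted):

>>> import mdptoolbox, mdptoolbox.example
>>> P, R = mdptoolbox.example.forest()
>>> vi = mdptoolbox.mdp.ValueIteration(P, R, 0.96)
>>> vi.run()
>>> vi.policy  # doctest: +SKIP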

"""

# Copyright (c) 2011-2013 Steven A. W. Cordwell
# Copyright (c) 2009 INRA
# 
# All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 
#   * Redistributions of source code must retain the above copyright notice,
#     this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions and the following disclaimer in the documentation
#     and/or other materials provided with the distribution.
#   * Neither the name of the <ORGANIZATION> nor the names of its contributors
#     may be used to endorse or promote products derived from this software
#     without specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import math as _math
import time as _time

import numpy as _np
import scipy.sparse as _sp

try:
    from .util import check, getSpan
except ValueError:
    # importing mdp as a module rather than as part of a package
    from util import check, getSpan

_MSG_STOP_MAX_ITER = "Iterating stopped due to maximum number of iterations " \
    "condition."
_MSG_STOP_EPSILON_OPTIMAL_POLICY = "Iterating stopped, epsilon-optimal " \
    "policy found."
_MSG_STOP_EPSILON_OPTIMAL_VALUE = "Iterating stopped, epsilon-optimal value " \
    "function found."
_MSG_STOP_UNCHANGING_POLICY = "Iterating stopped, unchanging policy found."

class MDP(object):
    
    """A Markov Decision Problem.
    
    Let ``S`` = the number of states, and ``A`` = the number of actions.
    
    Parameters
    ----------
    transitions : array
        Transition probability matrices. These can be defined in a variety of 
        ways. The simplest is a numpy array that has the shape ``(A, S, S)``,
        though there are other possibilities. It can be a tuple or list or
        numpy object array of length ``A``, where each element contains a numpy
        array or matrix that has the shape ``(S, S)``. This "list of matrices"
        form is useful when the transition matrices are sparse as
        ``scipy.sparse.csr_matrix`` matrices can be used. In summary, each
        action's transition matrix must be indexable like ``transitions[a]``
        where ``a`` ∈ {0, 1...A-1}, and ``transitions[a]`` returns an ``S`` ×
        ``S`` array-like object.
    reward : array
        Reward matrices or vectors. Like the transition matrices, these can
        also be defined in a variety of ways. Again the simplest is a numpy
        array that has the shape ``(S, A)``, ``(S,)`` or ``(A, S, S)``. A list
        of lists can be used, where each inner list has length ``S`` and the
        outer list has length ``A``. A list of numpy arrays is possible where
        each inner array can be of the shape ``(S,)``, ``(S, 1)``, ``(1, S)``
        or ``(S, S)``. Also ``scipy.sparse.csr_matrix`` can be used instead of
        numpy arrays. In addition, the outer list can be replaced by any object
        that can be indexed like ``reward[a]`` such as a tuple or numpy object
        array of length ``A``.
    discount : float
        Discount factor. The per time-step discount factor on future rewards.
        Valid values are greater than 0 up to and including 1. If the discount
        factor is 1, then convergence cannot be assumed and a warning will
        be displayed. Subclasses of ``MDP`` may pass ``None`` in the case where
        the algorithm does not use a discount factor.
    epsilon : float
        Stopping criterion. The maximum change in the value function at each
        iteration is compared against ``epsilon``. Once the change falls below
        this value, then the value function is considered to have converged to
        the optimal value function. Subclasses of ``MDP`` may pass ``None`` in
        the case where the algorithm does not use an epsilon-optimal stopping
        criterion.
    max_iter : int
        Maximum number of iterations. The algorithm will be terminated once
        this many iterations have elapsed. This must be greater than 0 if
        specified. Subclasses of ``MDP`` may pass ``None`` in the case where
        the algorithm does not use a maximum number of iterations.
    
    Attributes
    ----------
    P : array
        Transition probability matrices.
    R : array
        Reward vectors.
    V : tuple
        The optimal value function. Each element is a float corresponding to
        the expected value of being in that state assuming the optimal policy
        is followed.
    discount : float
        The discount rate on future rewards.
    max_iter : int
        The maximum number of iterations.
    policy : tuple
        The optimal policy.
    time : float
        The time used to converge to the optimal policy.
    verbose : boolean
        Whether verbose output should be displayed or not.
    
    Methods
    -------
    run
        Implemented in child classes as the main algorithm loop. Raises an
        exception if it has not been overridden.
    setSilent
        Turn the verbosity off
    setVerbose
        Turn the verbosity on
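
    A minimal sketch of the two ``transitions`` formats described above (a
    dense ``(A, S, S)`` array versus a list of sparse matrices), using
    illustrative numbers for two actions and two states:

    >>> import numpy as np
    >>> import scipy.sparse as sp
    >>> P_dense = np.array([[[0.5, 0.5], [0.8, 0.2]],
    ...                     [[0.0, 1.0], [0.1, 0.9]]])
    >>> P_sparse = [sp.csr_matrix(P_dense[aa]) for aa in range(2)]
    >>> R = np.array([[5, 10], [-1, 2]])

    Either ``P_dense`` or ``P_sparse`` can then be passed as ``transitions``,
    together with ``R`` as ``reward``, to any of the subclasses of ``MDP``.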
    
    """
    
    def __init__(self, transitions, reward, discount, epsilon, max_iter):
        # Initialise an MDP based on the input parameters.
        
        # if the discount is None then the algorithm is assumed to not use it
        # in its computations
        if discount is not None:
            self.discount = float(discount)
            assert 0.0 < self.discount <= 1.0, "Discount rate must be in ]0; 1]"
            if self.discount == 1:
                print("WARNING: check conditions of convergence. With no "
                      "discount, convergence can not be assumed.")
        # if the max_iter is None then the algorithm is assumed to not use it
        # in its computations
        if max_iter is not None:
            self.max_iter = int(max_iter)
            assert self.max_iter > 0, "The maximum number of iterations " \
                                      "must be greater than 0."
        # check that epsilon is something sane
        if epsilon is not None:
            self.epsilon = float(epsilon)
            assert self.epsilon > 0, "Epsilon must be greater than 0."
        # we run a check on P and R to make sure they are describing an MDP. If
        # an exception isn't raised then they are assumed to be correct.
        check(transitions, reward)
        # computePR will assign the variables self.S, self.A, self.P and self.R
        self._computePR(transitions, reward)
        # the verbosity is by default turned off
        self.verbose = False
        # Initially the time taken to perform the computations is set to None
        self.time = None
        # set the initial iteration count to zero
        self.iter = 0
        # V should be stored as a vector ie shape of (S,) or (1, S)
        self.V = None
        # policy can also be stored as a vector
        self.policy = None
    
    def __repr__(self):
        P_repr = "P: \n"
        R_repr = "R: \n"
        for aa in range(self.A):
            P_repr += repr(self.P[aa]) + "\n"
            R_repr += repr(self.R[aa]) + "\n"
        return(P_repr + "\n" + R_repr)
    
    def _bellmanOperator(self, V=None):
        # Apply the Bellman operator on the value function.
        # 
        # Updates the value function and the Vprev-improving policy.
        # 
        # Returns: (policy, value), tuple of new policy and its value
        #
        # If V hasn't been sent into the method, then we assume we are working
        # on the object's V attribute
        if V is None:
            # this V should be a reference to the data rather than a copy
            V = self.V
        else:
            # make sure the user supplied V is of the right shape
            try:
                assert V.shape in ((self.S,), (1, self.S)), "V is not the " \
                    "right shape (Bellman operator)."
            except AttributeError:
                raise TypeError("V must be a numpy array or matrix.")
        # Looping through each action, the Q-value matrix is calculated.
        # P and V can be any object that supports indexing, so it is important
        # that you know they define a valid MDP before calling the
        # _bellmanOperator method. Otherwise the results will be meaningless.
        Q = _np.empty((self.A, self.S))
        for aa in range(self.A):
            Q[aa] = self.R[aa] + self.discount * self.P[aa].dot(V)
        # Get the policy and value, for now it is being returned but...
        # Which way is better?
        # 1. Return, (policy, value)
        return (Q.argmax(axis=0), Q.max(axis=0))
        # 2. update self.policy and self.V directly
        # self.V = Q.max(axis=0)
        # self.policy = Q.argmax(axis=0)
    
    def _computeP(self, P):
        # Set self.P as a tuple of length A, with each element storing an S×S
        # matrix.
        self.A = len(P)
        try:
            if P.ndim == 3:
                self.S = P.shape[1]
            else:
                self.S = P[0].shape[0]
        except AttributeError:
            self.S = P[0].shape[0]
        # convert P to a tuple of numpy arrays
        self.P = tuple(P[aa] for aa in range(self.A))
    
    def _computePR(self, P, R):
        # Compute the reward for the system in one state choosing an action.
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        #    P(SxSxA)  = transition matrix 
        #        P could be an array with 3 dimensions or  a cell array (1xA), 
        #        each cell containing a matrix (SxS) possibly sparse
        #    R(SxSxA) or (SxA) = reward matrix
        #        R could be an array with 3 dimensions (SxSxA) or  a cell array 
        #        (1xA), each cell containing a sparse matrix (SxS) or a 2D 
        #        array(SxA) possibly sparse  
        # Evaluation
        # ----------
        #    PR(SxA)   = reward matrix
        #
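        # As an illustrative note, for the (S, S, A)-shaped reward case the
        # state-action reward computed below is the expectation
        #     PR[s, a] = sum_{s'} P[a][s, s'] * R[a][s, s']
        #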
        # We assume that P and R define an MDP, i.e. the assumption is that
        # check(P, R) has already been run and doesn't fail.
        #
        # First compute and store P, S, and A
        self._computeP(P)
        # Set self.R as a tuple of length A, with each element storing a 1×S
        # vector.
        try:
            if R.ndim == 1:
                r = _np.array(R).reshape(self.S)
                self.R = tuple(r for aa in range(self.A))
            elif R.ndim == 2:
                self.R = tuple(_np.array(R[:, aa]).reshape(self.S)
                                for aa in range(self.A))
            else:
                self.R = tuple(_np.multiply(P[aa], R[aa]).sum(1).reshape(self.S)
                                for aa in range(self.A))
        except AttributeError:
            if len(R) == self.A:
                self.R = tuple(_np.multiply(P[aa], R[aa]).sum(1).reshape(self.S)
                                for aa in range(self.A))
            else:
                r = _np.array(R).reshape(self.S)
                self.R = tuple(r for aa in range(self.A))
    
    def run(self):
        # Raise error because child classes should implement this function.
        raise NotImplementedError("You should create a run() method.")
    
    def setSilent(self):
        """Set the MDP algorithm to silent mode."""
        self.verbose = False
    
    def setVerbose(self):
        """Set the MDP algorithm to verbose mode."""
        self.verbose = True

class FiniteHorizon(MDP):
    
    """An MDP solved using the finite-horizon backwards induction algorithm.
    
    Parameters
    ----------
    transitions : array
        Transition probability matrices. See the documentation for the ``MDP``
        class for details.
    reward : array
        Reward matrices or vectors. See the documentation for the ``MDP`` class
        for details.
    discount : float
        Discount factor. See the documentation for the ``MDP`` class for
        details.
    N : int
        Number of periods. Must be greater than 0.
    h : array, optional
        Terminal reward. Default: a vector of zeros.
    
    Data Attributes
    ---------------
    V : array 
        Optimal value function. Shape = (S, N+1). ``V[:, n]`` = optimal value
        function at stage ``n`` with stage in {0, 1...N-1}. ``V[:, N]`` value
        function for terminal stage. 
    policy : array
        Optimal policy. ``policy[:, n]`` = optimal policy at stage ``n`` with
        stage in {0, 1...N-1}. ``policy[:, N-1]`` = policy for the last stage.
    time : float
        used CPU time
  
    Notes
    -----
    In verbose mode, displays the current stage and policy transpose.
    
    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.forest()
    >>> fh = mdptoolbox.mdp.FiniteHorizon(P, R, 0.9, 3)
    >>> fh.run()
    >>> fh.V
    array([[ 2.6973,  0.81  ,  0.    ,  0.    ],
           [ 5.9373,  3.24  ,  1.    ,  0.    ],
           [ 9.9373,  7.24  ,  4.    ,  0.    ]])
    >>> fh.policy
    array([[0, 0, 0],
           [0, 0, 1],
           [0, 0, 0]])
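
    A sketch of supplying the optional terminal reward ``h`` (an illustrative
    vector with one entry per state):

    >>> fh = mdptoolbox.mdp.FiniteHorizon(P, R, 0.9, 3, h=[0, 0, 1])
    >>> fh.run()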
    
    """

    def __init__(self, transitions, reward, discount, N, h=None):
        # Initialise a finite horizon MDP.
        self.N = int(N)
        assert self.N > 0, "N must be greater than 0."
        # Initialise the base class
        MDP.__init__(self, transitions, reward, discount, None, None)
        # remove the iteration counter, it is not meaningful for backwards
        # induction
        del self.iter
        # There are value vectors for each time step up to the horizon
        self.V = _np.zeros((self.S, N + 1))
        # There are policy vectors for each time step before the horizon, when
        # we reach the horizon we don't need to make decisions anymore.
        self.policy = _np.empty((self.S, N), dtype=int)
        # Set the reward for the final transition to h, if specified.
        if h is not None:
            self.V[:, N] = h
        # Call the iteration method
        #self.run()
        
    def run(self):
        # Run the finite horizon algorithm.
        self.time = _time.time()
        # loop through each time period
        for n in range(self.N):
            W, X = self._bellmanOperator(self.V[:, self.N - n])
            stage = self.N - n - 1
            self.V[:, stage] = X
            self.policy[:, stage] = W
            if self.verbose:
                print(("stage: %s, policy: %s") % (
                    stage, self.policy[:, stage].tolist()))
        # update time spent running
        self.time = _time.time() - self.time
        # After this we could create a tuple of tuples for the values and 
        # policies.
        #self.V = tuple(tuple(self.V[:, n].tolist()) for n in range(self.N))
        #self.policy = tuple(tuple(self.policy[:, n].tolist())
        #                    for n in range(self.N))

class LP(MDP):
    
    """A discounted MDP solved using linear programming.
    
    This class requires the Python ``cvxopt`` module to be installed.

    Arguments
    ---------
    transitions : array
        Transition probability matrices. See the documentation for the ``MDP``
        class for details.
    reward : array
        Reward matrices or vectors. See the documentation for the ``MDP`` class
        for details.
    discount : float
        Discount factor. See the documentation for the ``MDP`` class for
        details.
    
    Data Attributes
    ---------------
    V : tuple
        optimal values
    policy : tuple
        optimal policy
    time : float
        used CPU time
    
    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.forest()
    >>> lp = mdptoolbox.mdp.LP(P, R, 0.9)
    >>> lp.run()
    
    """

    def __init__(self, transitions, reward, discount):
        # Initialise a linear programming MDP.
        # import some functions from cvxopt and set them as object methods
        try:
            from cvxopt import matrix, solvers
            self._linprog = solvers.lp
            self._cvxmat = matrix
        except ImportError:
            raise ImportError("The python module cvxopt is required to use "
                              "linear programming functionality.")
        # initialise the MDP. epsilon and max_iter are not needed
        MDP.__init__(self, transitions, reward, discount, None, None)
        # Set the cvxopt solver to be quiet by default, but ...
        # this doesn't do what I want it to do c.f. issue #3
        if not self.verbose:
            solvers.options['show_progress'] = False
        # Call the iteration method
        #self.run()
    
    def run(self):
        # Run the linear programming algorithm.
        self.time = _time.time()
        # The objective is to resolve : min V / V >= PR + discount*P*V
        # The function linprog of the optimisation Toolbox of Mathworks
        # resolves :
        # min f'* x / M * x <= b
        # So the objective could be expressed as :
        # min V / (discount*P-I) * V <= - PR
        # To avoid loop on states, the matrix M is structured following actions
        # M(A*S,S)
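        # As an illustrative note: row block aa of M is
        # (discount * P[aa] - I), so the single constraint M * V <= -h
        # enforces V >= R[aa] + discount * P[aa] * V for every action aa.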
        f = self._cvxmat(_np.ones((self.S, 1)))
        h = self._cvxmat(self.R.reshape(self.S * self.A, 1, order="F"), tc='d')
        M = _np.zeros((self.A * self.S, self.S))
        for aa in range(self.A):
            pos = (aa + 1) * self.S
            M[(pos - self.S):pos, :] = (
                self.discount * self.P[aa] - _sp.eye(self.S, self.S))
        M = self._cvxmat(M)
        # Using the glpk option will make this behave more like Octave
        # (Octave uses glpk) and perhaps Matlab. If solver=None (ie using the 
        # default cvxopt solver) then V agrees with the Octave equivalent
        # only to 10e-8 places. This assumes glpk is installed of course.
        self.V = _np.array(self._linprog(f, M, -h, solver='glpk')['x'])
        # apply the Bellman operator
        self.policy, self.V = self._bellmanOperator()
        # update the time spent solving
        self.time = _time.time() - self.time
        # store value and policy as tuples
        self.V = tuple(self.V.tolist())
        self.policy = tuple(self.policy.tolist())

class PolicyIteration(MDP):
    
    """A discounted MDP solved using the policy iteration algorithm.
    
    Arguments
    ---------
    transitions : array
        Transition probability matrices. See the documentation for the ``MDP``
        class for details.
    reward : array
        Reward matrices or vectors. See the documentation for the ``MDP`` class
        for details. 
    discount : float
        Discount factor. See the documentation for the ``MDP`` class for
        details.
    policy0 : array, optional
        Starting policy.
    max_iter : int, optional
        Maximum number of iterations. See the documentation for the ``MDP``
        class for details. Default is 1000.
    eval_type : int or string, optional
        Type of function used to evaluate policy. 0 or "matrix" to solve as a
        set of linear equations. 1 or "iterative" to solve iteratively.
        Default: 0.
             
    Data Attributes
    ---------------
    V : tuple
        value function 
    policy : tuple
        optimal policy
    iter : int
        number of done iterations
    time : float
        used CPU time
    
    Notes
    -----
    In verbose mode, at each iteration, displays the number
    of different actions between policy n-1 and n
    
    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.rand()
    >>> pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.9)
    >>> pi.run()
    
    >>> P, R = mdptoolbox.example.forest()
    >>> pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.9)
    >>> pi.run()
    >>> pi.V
    (26.244000000000018, 29.48400000000002, 33.484000000000016)
    >>> pi.policy
    (0, 0, 0)
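
    A sketch of requesting the iterative policy evaluation instead of the
    default matrix (linear equation) evaluation:

    >>> pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.9, eval_type="iterative")
    >>> pi.run()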
    """
    
    def __init__(self, transitions, reward, discount, policy0=None,
                 max_iter=1000, eval_type=0):
        # Initialise a policy iteration MDP.
        #
        # Set up the MDP, but don't need to worry about epsilon values
        MDP.__init__(self, transitions, reward, discount, None, max_iter)
        # Check if the user has supplied an initial policy. If not make one.
        if policy0 is None:
            # Initialise the policy to the one which maximises the expected
            # immediate reward
            null = _np.zeros(self.S)
            self.policy, null = self._bellmanOperator(null)
            del null
        else:
            # Use the policy that the user supplied
            # Make sure it is a numpy array
            policy0 = _np.array(policy0)
            # Make sure the policy is the right size and shape
            assert policy0.shape in ((self.S, ), (self.S, 1), (1, self.S)), \
                "'policy0' must be a vector with length S."
            # reshape the policy to be a vector
            policy0 = policy0.reshape(self.S)
            # The policy can only contain integers between 0 and S-1
            msg = "'policy0' must be a vector of integers between 0 and S-1."
            assert not _np.mod(policy0, 1).any(), msg
            assert (policy0 >= 0).all(), msg
            assert (policy0 < self.S).all(), msg
            self.policy = policy0
        # set the initial values to zero
        self.V = _np.zeros(self.S)
        # Do some setup depending on the evaluation type
        if eval_type in (0, "matrix"):
            self.eval_type = "matrix"
        elif eval_type in (1, "iterative"):
            self.eval_type = "iterative"
        else:
            raise ValueError("'eval_type' should be '0' for matrix evaluation "
                             "or '1' for iterative evaluation. The strings "
                             "'matrix' and 'iterative' can also be used.")
        # Call the iteration method
        #self.run()
    
    def _computePpolicyPRpolicy(self):
        # Compute the transition matrix and the reward matrix for a policy.
        #
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        # P(SxSxA)  = transition matrix 
        #     P could be an array with 3 dimensions or a cell array (1xA),
        #     each cell containing a matrix (SxS) possibly sparse
        # R(SxSxA) or (SxA) = reward matrix
        #     R could be an array with 3 dimensions (SxSxA) or 
        #     a cell array (1xA), each cell containing a sparse matrix (SxS) or
        #     a 2D array(SxA) possibly sparse  
        # policy(S) = a policy
        #
        # Evaluation
        # ----------
        # Ppolicy(SxS)  = transition matrix for policy
        # PRpolicy(S)   = reward matrix for policy
        #
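        # As an illustrative note, the vectorised construction below is
        # equivalent to, for each state s,
        #     Ppolicy[s, :] = P[policy[s]][s, :]
        #     Rpolicy[s]    = R[policy[s]][s]
        #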
        Ppolicy = _np.empty((self.S, self.S))
        Rpolicy = _np.zeros(self.S)
        for aa in range(self.A): # avoid looping over S
            # the rows that use action a.
            ind = (self.policy == aa).nonzero()[0]
            # if no rows use action a, then no need to assign this
            if ind.size > 0:
                try:
                    Ppolicy[ind, :] = self.P[aa][ind, :]
                except ValueError:
                    Ppolicy[ind, :] = self.P[aa][ind, :].todense()
                #PR = self._computePR() # an apparently unneeded line, and
                # perhaps harmful in this implementation c.f.
                # mdp_computePpolicyPRpolicy.m
                Rpolicy[ind] = self.R[aa][ind]
        # self.R cannot be sparse with the code in its current condition, but
        # it should be possible in the future. Also, if R is so big that it's
        # a good idea to use a sparse matrix for it, then converting PRpolicy
        # from a dense to sparse matrix doesn't seem very memory efficient
        if type(self.R) is _sp.csr_matrix:
            Rpolicy = _sp.csr_matrix(Rpolicy)
        #self.Ppolicy = Ppolicy
        #self.Rpolicy = Rpolicy
        return (Ppolicy, Rpolicy)
    
    def _evalPolicyIterative(self, V0=0, epsilon=0.0001, max_iter=10000):
        # Evaluate a policy using iteration.
        #
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        # P(SxSxA)  = transition matrix 
        #    P could be an array with 3 dimensions or 
        #    a cell array (1xA), each cell containing a matrix possibly sparse
        # R(SxSxA) or (SxA) = reward matrix
        #    R could be an array with 3 dimensions (SxSxA) or 
        #    a cell array (1xA), each cell containing a sparse matrix (SxS) or
        #    a 2D array(SxA) possibly sparse  
        # discount  = discount rate in ]0; 1[
        # policy(S) = a policy
        # V0(S)     = starting value function, optional (default : zeros(S,1))
        # epsilon   = epsilon-optimal policy search, greater than 0,
        #    optional (default : 0.0001)
        # max_iter  = maximum number of iterations to be done, greater than 0,
        #    optional (default : 10000)
        #    
        # Evaluation
        # ----------
        # Vpolicy(S) = value function, associated to a specific policy
        #
        # Notes
        # -----
        # In verbose mode, at each iteration, displays the condition which
        # stopped iterations: epsilon-optimum value function found or maximum
        # number of iterations reached.
        #
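        # As an illustrative note, each sweep below applies
        #     V_{n+1} = Rpolicy + discount * Ppolicy * V_n
        # and stops once the sup-norm change falls below
        # ((1 - discount) / discount) * epsilon, which bounds the remaining
        # distance to the value of the policy by epsilon.
        #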
        try:
            assert V0.shape in ((self.S, ), (self.S, 1), (1, self.S)), \
                "'V0' must be a vector of length S."
            policy_V = _np.array(V0).reshape(self.S)
        except AttributeError:
            if V0 == 0:
                policy_V = _np.zeros(self.S)
            else:
                policy_V = _np.array(V0).reshape(self.S)
        
        policy_P, policy_R = self._computePpolicyPRpolicy()
        
        if self.verbose:
            print('    Iteration\t\t    V variation')
        
        itr = 0
        done = False
        while not done:
            itr += 1
            
            Vprev = policy_V
            policy_V = policy_R + self.discount * policy_P.dot(Vprev)
            
            variation = _np.absolute(policy_V - Vprev).max()
            if self.verbose:
                print(('      %s\t\t      %s') % (itr, variation))
            
            # ensure |Vn - Vpolicy| < epsilon
            if variation < ((1 - self.discount) / self.discount) * epsilon:
                done = True
                if self.verbose:
                    print(_MSG_STOP_EPSILON_OPTIMAL_VALUE)
            elif itr == max_iter:
                done = True
                if self.verbose:
                    print(_MSG_STOP_MAX_ITER)
        
        self.V = policy_V
    
    def _evalPolicyMatrix(self):
        # Evaluate the value function of the policy using linear equations.
        #
        # Arguments 
        # ---------
        # Let S = number of states, A = number of actions
        # P(SxSxA) = transition matrix 
        #      P could be an array with 3 dimensions or a cell array (1xA),
        #      each cell containing a matrix (SxS) possibly sparse
        # R(SxSxA) or (SxA) = reward matrix
        #      R could be an array with 3 dimensions (SxSxA) or 
        #      a cell array (1xA), each cell containing a sparse matrix (SxS) or
        #      a 2D array(SxA) possibly sparse  
        # discount = discount rate in ]0; 1[
        # policy(S) = a policy
        #
        # Evaluation
        # ----------
        # Vpolicy(S) = value function of the policy
        #
        Ppolicy, Rpolicy = self._computePpolicyPRpolicy()
        # V = PR + gPV  => (I-gP)V = PR  => V = inv(I-gP)* PR
        self.V = _np.linalg.solve(
            (_sp.eye(self.S, self.S) - self.discount * Ppolicy), Rpolicy)
    
    def run(self):
        # Run the policy iteration algorithm.
        # If verbose then print a header
        if self.verbose:
            print('  Iteration\t\tNumber of different actions')
        # Set up the while stopping condition and the current time
        done = False
        self.time = _time.time()
        # loop until a stopping condition is reached
        while not done:
            self.iter += 1
            # these _evalPolicy* functions will update the class's value
            # attribute
            if self.eval_type == "matrix":
                self._evalPolicyMatrix()
            elif self.eval_type == "iterative":
                self._evalPolicyIterative()
            # This should update the class's policy attribute but leave the
            # value alone
            policy_next, null = self._bellmanOperator()
            del null
            # calculate in how many places the old policy disagrees with
            # the new policy
            n_different = (policy_next != self.policy).sum()
            # if verbose then continue printing a table
            if self.verbose:
                print(('    %s\t\t  %s') % (self.iter, n_different))
            # Once the policy is unchanging or the maximum number of
            # iterations has been reached then stop
            if n_different == 0:
                done = True
                if self.verbose:
                    print(_MSG_STOP_UNCHANGING_POLICY)
            elif (self.iter == self.max_iter):
                done = True 
                if self.verbose:
                    print(_MSG_STOP_MAX_ITER)
            else:
                self.policy = policy_next
        # update the time to return the computation time
        self.time = _time.time() - self.time
        # store value and policy as tuples
        self.V = tuple(self.V.tolist())
        self.policy = tuple(self.policy.tolist())

class PolicyIterationModified(PolicyIteration):
    
    """A discounted MDP solved using a modified policy iteration algorithm.
    
    Arguments
    ---------
    transitions : array
        Transition probability matrices. See the documentation for the ``MDP``
        class for details.
    reward : array
        Reward matrices or vectors. See the documentation for the ``MDP`` class
        for details.
    discount : float
        Discount factor. See the documentation for the ``MDP`` class for
        details.
    epsilon : float, optional
        Stopping criterion. See the documentation for the ``MDP`` class for
        details. Default: 0.01.
    max_iter : int, optional
        Maximum number of iterations. See the documentation for the ``MDP``
        class for details. Default is 10.
    
    Data Attributes
    ---------------
    V : tuple
        value function 
    policy : tuple
        optimal policy
    iter : int
        number of done iterations
    time : float
        used CPU time
    
    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.forest()
    >>> pim = mdptoolbox.mdp.PolicyIterationModified(P, R, 0.9)
    >>> pim.run()
    >>> pim.policy
    (0, 0, 0)
    >>> pim.V
    (21.81408652334702, 25.054086523347017, 29.054086523347017)
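
    A sketch of tightening the stopping criterion via ``epsilon`` (an
    illustrative value):

    >>> pim = mdptoolbox.mdp.PolicyIterationModified(P, R, 0.9, epsilon=0.001)
    >>> pim.run()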
    
    """
    
    def __init__(self, transitions, reward, discount, epsilon=0.01,
                 max_iter=10):
        # Initialise a (modified) policy iteration MDP.
        
        # Maybe it's better not to subclass from PolicyIteration, because the
        # initialisation of the two are quite different. eg there is policy0
        # being calculated here which doesn't need to be. The only thing that
        # is needed from the PolicyIteration class is the _evalPolicyIterative
        # function. Perhaps there is a better way to do it?
        PolicyIteration.__init__(self, transitions, reward, discount, None,
                                 max_iter, 1)
        
        # PolicyIteration doesn't pass epsilon to MDP.__init__() so we will
        # check it here
        self.epsilon = float(epsilon)
        assert epsilon > 0, "'epsilon' must be greater than 0."
        
        # computation of threshold of variation for V for an epsilon-optimal
        # policy
        if self.discount != 1:
            self.thresh = self.epsilon * (1 - self.discount) / self.discount
        else:
            self.thresh = self.epsilon
        
        if self.discount == 1:
            self.V = _np.zeros((self.S, 1))
        else:
            Rmin = min(R.min() for R in self.R)
            self.V = 1 / (1 - self.discount) * Rmin * _np.ones((self.S,))
        
        # Call the iteration method
        #self.run()
    
    def run(self):
        # Run the modified policy iteration algorithm.
        
        if self.verbose:
            print('  \tIteration\t\tV-variation')
        
        self.time = _time.time()
        
        done = False
        while not done:
            self.iter += 1
            
            self.policy, Vnext = self._bellmanOperator()
            #[Ppolicy, PRpolicy] = mdp_computePpolicyPRpolicy(P, PR, policy);
            
            variation = getSpan(Vnext - self.V)
            if self.verbose:
                print(("    %s\t\t  %s" % (self.iter, variation)))
            
            self.V = Vnext
            if variation < self.thresh:
                done = True
            else:
                is_verbose = False
                if self.verbose:
                    self.setSilent()
                    is_verbose = True