# -*- coding: utf-8 -*-
"""Markov Decision Process (MDP) Toolbox: ``mdp`` module
=====================================================

The ``mdp`` module provides classes for the resolution of discrete-time Markov
Decision Processes.

Available classes
-----------------
MDP
    Base Markov decision process class
FiniteHorizon
    Backwards induction finite horizon MDP
LP
    Linear programming MDP
PolicyIteration
    Policy iteration MDP
PolicyIterationModified
    Modified policy iteration MDP
QLearning
    Q-learning MDP
RelativeValueIteration
    Relative value iteration MDP
ValueIteration
    Value iteration MDP
ValueIterationGS
    Gauss-Seidel value iteration MDP

"""

# Copyright (c) 2011-2013 Steven A. W. Cordwell
# Copyright (c) 2009 INRA
# 
# All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 
#   * Redistributions of source code must retain the above copyright notice,
#     this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions and the following disclaimer in the documentation
#     and/or other materials provided with the distribution.
#   * Neither the name of the <ORGANIZATION> nor the names of its contributors
#     may be used to endorse or promote products derived from this software
#     without specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

from math import ceil, log, sqrt
from time import time

from numpy import absolute, array, empty, mean, mod, multiply, ones, zeros
from numpy.random import randint, random
from scipy.sparse import csr_matrix as sparse

try:
    from .util import check, getSpan
except ValueError:
    # importing mdp as a module rather than as part of a package
    from util import check, getSpan

MSG_STOP_MAX_ITER = "Iterating stopped due to maximum number of iterations " \
    "condition."
MSG_STOP_EPSILON_OPTIMAL_POLICY = "Iterating stopped, epsilon-optimal " \
    "policy found."
MSG_STOP_EPSILON_OPTIMAL_VALUE = "Iterating stopped, epsilon-optimal value " \
    "function found."
MSG_STOP_UNCHANGING_POLICY = "Iterating stopped, unchanging policy found."

class MDP(object):
    
    """A Markov Decision Problem.

    Let ``S`` = the number of states, and ``A`` = the number of actions.

    Parameters
    ----------
    transitions : array
        Transition probability matrices. These can be defined in a variety of
        ways. The simplest is a numpy array that has the shape ``(A, S, S)``,
        though there are other possibilities. It can be a tuple or list or
        numpy object array of length ``A``, where each element contains a numpy
        array or matrix that has the shape ``(S, S)``. This "list of matrices"
        form is useful when the transition matrices are sparse as
        ``scipy.sparse.csr_matrix`` matrices can be used. In summary, each
        action's transition matrix must be indexable like ``transitions[a]``
        where ``a`` ∈ {0, 1...A-1}, and ``transitions[a]`` returns an ``S`` ×
        ``S`` array-like object.
    reward : array
        Reward matrices or vectors. Like the transition matrices, these can
        also be defined in a variety of ways. Again the simplest is a numpy
        array that has the shape ``(S, A)``, ``(S,)`` or ``(A, S, S)``. A list
        of lists can be used, where each inner list has length ``S`` and the
        outer list has length ``A``. A list of numpy arrays is possible where
        each inner array can be of the shape ``(S,)``, ``(S, 1)``, ``(1, S)``
        or ``(S, S)``. Also ``scipy.sparse.csr_matrix`` can be used instead of
        numpy arrays. In addition, the outer list can be replaced by any object
        that can be indexed like ``reward[a]`` such as a tuple or numpy object
        array of length ``A``.
    discount : float
        Discount factor. The per time-step discount factor on future rewards.
        Valid values are greater than 0 up to and including 1. If the discount
        factor is 1, then convergence cannot be assumed and a warning will
        be displayed. Subclasses of ``MDP`` may pass ``None`` in the case where
        the algorithm does not use a discount factor.
    epsilon : float
        Stopping criterion. The maximum change in the value function at each
        iteration is compared against ``epsilon``. Once the change falls below
        this value, then the value function is considered to have converged to
        the optimal value function. Subclasses of ``MDP`` may pass ``None`` in
        the case where the algorithm does not use an epsilon-optimal stopping
        criterion.
    max_iter : int
        Maximum number of iterations. The algorithm will be terminated once
        this many iterations have elapsed. This must be greater than 0 if
        specified. Subclasses of ``MDP`` may pass ``None`` in the case where
        the algorithm does not use a maximum number of iterations.

    Attributes
    ----------
    P : array
        Transition probability matrices.
    R : array
        Reward vectors.
    V : tuple
        The optimal value function. Each element is a float corresponding to
        the expected value of being in that state assuming the optimal policy
        is followed.
    discount : float
        The discount rate on future rewards.
    max_iter : int
        The maximum number of iterations.
    policy : tuple
        The optimal policy.
    time : float
        The time used to converge to the optimal policy.
    verbose : boolean
        Whether verbose output should be displayed or not.

    Methods
    -------
    run
        Implemented in child classes as the main algorithm loop. Raises an
        exception if it has not been overridden.
    setSilent
        Turn the verbosity off
    setVerbose
        Turn the verbosity on
    
    """
    
    def __init__(self, transitions, reward, discount, epsilon, max_iter):
        # Initialise an MDP based on the input parameters.

        # if the discount is None then the algorithm is assumed to not use it
        # in its computations
        if discount is not None:
            self.discount = float(discount)
            assert 0.0 < self.discount <= 1.0, "Discount rate must be in ]0; 1]"
            if self.discount == 1:
                print("WARNING: check conditions of convergence. With no "
                      "discount, convergence can not be assumed.")
        # if the max_iter is None then the algorithm is assumed to not use it
        # in its computations
        if max_iter is not None:
            self.max_iter = int(max_iter)
            assert self.max_iter > 0, "The maximum number of iterations " \
                                      "must be greater than 0."
        # check that epsilon is something sane
        if epsilon is not None:
            self.epsilon = float(epsilon)
            assert self.epsilon > 0, "Epsilon must be greater than 0."
        # we run a check on P and R to make sure they are describing an MDP. If
        # an exception isn't raised then they are assumed to be correct.
        check(transitions, reward)
        # computePR will assign the variables self.S, self.A, self.P and self.R
        self._computePR(transitions, reward)
        # the verbosity is by default turned off
        self.verbose = False
        # Initially the time taken to perform the computations is set to None
        self.time = None
        # set the initial iteration count to zero
        self.iter = 0
        # V should be stored as a vector ie shape of (S,) or (1, S)
        self.V = None
        # policy can also be stored as a vector
        self.policy = None
    
    def __repr__(self):
        P_repr = "P: \n"
        R_repr = "R: \n"
        for aa in range(self.A):
            P_repr += repr(self.P[aa]) + "\n"
            R_repr += repr(self.R[aa]) + "\n"
        return P_repr + "\n" + R_repr

    def _bellmanOperator(self, V=None):
        # Apply the Bellman operator on the value function.
        #
        # Updates the value function and the Vprev-improving policy.
        #
        # Returns: (policy, value), tuple of new policy and its value
        #
        # If V hasn't been sent into the method, then we assume to be working
        # on the objects V attribute
        if V is None:
            # this V should be a reference to the data rather than a copy
            V = self.V
        else:
            # make sure the user supplied V is of the right shape
            try:
                assert V.shape in ((self.S,), (1, self.S)), "V is not the " \
                    "right shape (Bellman operator)."
            except AttributeError:
                raise TypeError("V must be a numpy array or matrix.")
        # Looping through each action the Q-value matrix is calculated.
        # P and V can be any object that supports indexing, so it is important
        # that you know they define a valid MDP before calling the
        # _bellmanOperator method. Otherwise the results will be meaningless.
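        # In equation form, for each action a and state s the loop below
        # computes
        #     Q[a, s] = R[a][s] + discount * sum_{s'} P[a][s, s'] * V[s']
        # and the return takes the argmax/max over actions for each state.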
        Q = empty((self.A, self.S))
        for aa in range(self.A):
            Q[aa] = self.R[aa] + self.discount * self.P[aa].dot(V)
        # Get the policy and value, for now it is being returned but...
        # Which way is better?
        # 1. Return, (policy, value)
        return (Q.argmax(axis=0), Q.max(axis=0))
        # 2. update self.policy and self.V directly
        # self.V = Q.max(axis=1)
        # self.policy = Q.argmax(axis=1)
    
    def _computeP(self, P):
        # Set self.P as a tuple of length A, with each element storing an S×S
        # matrix.
        self.A = len(P)
        try:
            if P.ndim == 3:
                self.S = P.shape[1]
            else:
                self.S = P[0].shape[0]
        except AttributeError:
            self.S = P[0].shape[0]
        # convert P to a tuple of numpy arrays
        self.P = tuple(P[aa] for aa in range(self.A))

    def _computePR(self, P, R):
        # Compute the reward for the system in one state choosing an action.
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        #    P(SxSxA)  = transition matrix 
        #        P could be an array with 3 dimensions or  a cell array (1xA), 
        #        each cell containing a matrix (SxS) possibly sparse
        #    R(SxSxA) or (SxA) = reward matrix
        #        R could be an array with 3 dimensions (SxSxA) or  a cell array 
        #        (1xA), each cell containing a sparse matrix (SxS) or a 2D 
        #        array(SxA) possibly sparse  
        # Evaluation
        # ----------
        #    PR(SxA)   = reward matrix
        #
        # We assume that P and R define an MDP, i.e. the assumption is that
        # check(P, R) has already been run and doesn't fail.
        #
        # First compute and store P, S, and A
        self._computeP(P)
        # Set self.R as a tuple of length A, with each element storing a 1×S
        # vector.
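        # When R is given per transition (shape (A, S, S) or a list of S x S
        # matrices), the expected immediate reward used below is
        #     R[a][s] = sum_{s'} P[a][s, s'] * R[a][s, s']
        # which is what the multiply(...).sum(1) branches compute.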
        try:
            if R.ndim == 1:
                r = array(R).reshape(self.S)
                self.R = tuple(r for aa in range(self.A))
            elif R.ndim == 2:
                self.R = tuple(array(R[:, aa]).reshape(self.S)
                                for aa in range(self.A))
            else:
                self.R = tuple(multiply(P[aa], R[aa]).sum(1).reshape(self.S)
                                for aa in range(self.A))
        except AttributeError:
            if len(R) == self.A:
                self.R = tuple(multiply(P[aa], R[aa]).sum(1).reshape(self.S)
                                for aa in range(self.A))
            else:
                r = array(R).reshape(self.S)
                self.R = tuple(r for aa in range(self.A))

    def run(self):
        # Raise error because child classes should implement this function.
        raise NotImplementedError("You should create a run() method.")

    def setSilent(self):
        """Set the MDP algorithm to silent mode."""
        self.verbose = False
    
    def setVerbose(self):
        """Set the MDP algorithm to verbose mode."""
        self.verbose = True

class FiniteHorizon(MDP):
    
    """An MDP solved using the finite-horizon backwards induction algorithm.

    Parameters
    ----------
    transitions : array
        Transition probability matrices. See the documentation for the ``MDP``
        class for details.
    reward : array
        Reward matrices or vectors. See the documentation for the ``MDP`` class
        for details.
    discount : float
        Discount factor. See the documentation for the ``MDP`` class for
        details.
    N : int
        Number of periods. Must be greater than 0.
    h : array, optional
        Terminal reward. Default: a vector of zeros.
    
    Data Attributes
    ---------------
    V : array
        Optimal value function. Shape = (S, N+1). ``V[:, n]`` = optimal value
        function at stage ``n`` with stage in {0, 1...N-1}. ``V[:, N]`` value
        function for terminal stage.
    policy : array
        Optimal policy. ``policy[:, n]`` = optimal policy at stage ``n`` with
        stage in {0, 1...N-1}. ``policy[:, N-1]`` = policy for stage ``N-1``.
    time : float
        used CPU time

    Notes
    -----
    In verbose mode, displays the current stage and policy transpose.
    
    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.forest()
    >>> fh = mdptoolbox.mdp.FiniteHorizon(P, R, 0.9, 3)
    >>> fh.run()
    >>> fh.V
    array([[ 2.6973,  0.81  ,  0.    ,  0.    ],
           [ 5.9373,  3.24  ,  1.    ,  0.    ],
           [ 9.9373,  7.24  ,  4.    ,  0.    ]])
    >>> fh.policy
    array([[0, 0, 0],
           [0, 0, 1],
           [0, 0, 0]])
    
    """

    def __init__(self, transitions, reward, discount, N, h=None):
        # Initialise a finite horizon MDP.
        self.N = int(N)
        assert self.N > 0, "N must be greater than 0."
        # Initialise the base class
        MDP.__init__(self, transitions, reward, discount, None, None)
        # remove the iteration counter, it is not meaningful for backwards
        # induction
        del self.iter
        # There are value vectors for each time step up to the horizon
        self.V = zeros((self.S, N + 1))
        # There are policy vectors for each time step before the horizon, when
        # we reach the horizon we don't need to make decisions anymore.
        self.policy = empty((self.S, N), dtype=int)
        # Set the reward for the final transition to h, if specified.
        if h is not None:
            self.V[:, N] = h
        # Call the iteration method
        #self.run()

    def run(self):
        # Run the finite horizon algorithm.
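        # Backwards induction: starting from the terminal values in V[:, N],
        # each pass applies the Bellman operator once,
        #     V[:, n] = max_a (R_a + discount * P_a . V[:, n + 1]),
        # and records the maximising action in policy[:, n].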
        self.time = time()
        # loop through each time period
        for n in range(self.N):
            W, X = self._bellmanOperator(self.V[:, self.N - n])
            stage = self.N - n - 1
            self.V[:, stage] = X
            self.policy[:, stage] = W
            if self.verbose:
                print(("stage: %s, policy: %s") % (
                    stage, self.policy[:, stage].tolist()))
        # update time spent running
        self.time = time() - self.time
        # After this we could create a tuple of tuples for the values and
        # policies.
        #self.V = tuple(tuple(self.V[:, n].tolist()) for n in range(self.N))
        #self.policy = tuple(tuple(self.policy[:, n].tolist())
        #                    for n in range(self.N))

class LP(MDP):
    
    """A discounted MDP solved using linear programming.

    This class requires the Python ``cvxopt`` module to be installed.

    Arguments
    ---------
    transitions : array
        Transition probability matrices. See the documentation for the ``MDP``
        class for details.
    reward : array
        Reward matrices or vectors. See the documentation for the ``MDP`` class
        for details.
    discount : float
        Discount factor. See the documentation for the ``MDP`` class for
        details.
    
    Data Attributes
    ---------------
    V : tuple
        optimal values
    policy : tuple
        optimal policy
    time : float
        used CPU time
    
    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.forest()
    >>> lp = mdptoolbox.mdp.LP(P, R, 0.9)
    >>> lp.run()

    """

    def __init__(self, transitions, reward, discount):
        # Initialise a linear programming MDP.
        # import some functions from cvxopt and set them as object methods
        try:
            from cvxopt import matrix, solvers
            self._linprog = solvers.lp
            self._cvxmat = matrix
        except ImportError:
            raise ImportError("The python module cvxopt is required to use "
                              "linear programming functionality.")
        # we also need diagonal matrices, and using a sparse one may be more
        # memory efficient
        from scipy.sparse import eye as speye
        self._speye = speye
        # initialise the MDP. epsilon and max_iter are not needed
        MDP.__init__(self, transitions, reward, discount, None, None)
        # Set the cvxopt solver to be quiet by default, but ...
        # this doesn't do what I want it to do c.f. issue #3
        if not self.verbose:
            solvers.options['show_progress'] = False
        # Call the iteration method
        #self.run()

    def run(self):
        # Run the linear programming algorithm.
        self.time = time()
        # The objective is to resolve : min V / V >= PR + discount*P*V
        # The function linprog of the optimisation Toolbox of Mathworks
        # resolves :
        # min f'* x / M * x <= b
        # So the objective could be expressed as :
        # min V / (discount*P-I) * V <= - PR
        # To avoid loop on states, the matrix M is structured following actions
        # M(A*S,S)
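        # Concretely: f is a vector of ones (minimise sum_s V(s)), rows
        # aa*S:(aa+1)*S of M hold discount * P[aa] - I, and the right hand
        # side stacks the rewards, so each action contributes one constraint
        # block of the form (discount * P[aa] - I) * V <= -R[aa].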
        f = self._cvxmat(ones((self.S, 1)))
        h = self._cvxmat(self.R.reshape(self.S * self.A, 1, order="F"), tc='d')
        M = zeros((self.A * self.S, self.S))
        for aa in range(self.A):
            pos = (aa + 1) * self.S
            M[(pos - self.S):pos, :] = (
                self.discount * self.P[aa] - self._speye(self.S, self.S))
        M = self._cvxmat(M)
        # Using the glpk option will make this behave more like Octave
        # (Octave uses glpk) and perhaps Matlab. If solver=None (ie using the
        # default cvxopt solver) then V agrees with the Octave equivalent
        # only to 10e-8 places. This assumes glpk is installed of course.
        self.V = array(self._linprog(f, M, -h, solver='glpk')['x'])
        # apply the Bellman operator
        self.policy, self.V = self._bellmanOperator()
        # update the time spent solving
        self.time = time() - self.time
        # store value and policy as tuples
        self.V = tuple(self.V.tolist())
        self.policy = tuple(self.policy.tolist())

class PolicyIteration(MDP):
    
    """A discounted MDP solved using the policy iteration algorithm.

    Arguments
    ---------
    transitions : array
        Transition probability matrices. See the documentation for the ``MDP``
        class for details.
    reward : array
        Reward matrices or vectors. See the documentation for the ``MDP`` class
        for details.
    discount : float
        Discount factor. See the documentation for the ``MDP`` class for
        details.
    policy0 : array, optional
        Starting policy.
    max_iter : int, optional
        Maximum number of iterations. See the documentation for the ``MDP``
        class for details. Default is 1000.
    eval_type : int or string, optional
        Type of function used to evaluate policy. 0 or "matrix" to solve as a
        set of linear equations. 1 or "iterative" to solve iteratively.
        Default: 0.

    Data Attributes
    ---------------
    V : tuple
        value function
    policy : tuple
        optimal policy
    iter : int
        number of done iterations
    time : float
        used CPU time

    Notes
    -----
    In verbose mode, at each iteration, displays the number
    of different actions between policy n-1 and n.

    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.rand()
    >>> pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.9)
    >>> pi.run()

    >>> P, R = mdptoolbox.example.forest()
    >>> pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.9)
    >>> pi.run()
    >>> pi.V
    (26.244000000000018, 29.48400000000002, 33.484000000000016)
    >>> pi.policy
    (0, 0, 0)
    """
    
    def __init__(self, transitions, reward, discount, policy0=None,
                 max_iter=1000, eval_type=0):
        # Initialise a policy iteration MDP.
        #
        # Set up the MDP, but don't need to worry about epsilon values
        MDP.__init__(self, transitions, reward, discount, None, max_iter)
        # Check if the user has supplied an initial policy. If not make one.
        if policy0 is None:
            # Initialise the policy to the one which maximises the expected
            # immediate reward
            null = zeros(self.S)
            self.policy, null = self._bellmanOperator(null)
            del null
        else:
            # Use the policy that the user supplied
            # Make sure it is a numpy array
            policy0 = array(policy0)
            # Make sure the policy is the right size and shape
            assert policy0.shape in ((self.S, ), (self.S, 1), (1, self.S)), \
                "'policy0' must be a vector with length S."
            # reshape the policy to be a vector
            policy0 = policy0.reshape(self.S)
            # The policy can only contain integers between 0 and S-1
            msg = "'policy0' must be a vector of integers between 0 and S-1."
            assert not mod(policy0, 1).any(), msg
            assert (policy0 >= 0).all(), msg
            assert (policy0 < self.S).all(), msg
            self.policy = policy0
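            # For example, with S = 3 states a valid policy0 is (0, 1, 1),
            # i.e. action 0 in state 0 and action 1 in states 1 and 2.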
        # set the initial values to zero
        self.V = zeros(self.S)
        # Do some setup depending on the evaluation type
        if eval_type in (0, "matrix"):
            from numpy.linalg import solve
            from scipy.sparse import eye
            self._speye = eye
            self._lin_eq = solve
            self.eval_type = "matrix"
        elif eval_type in (1, "iterative"):
            self.eval_type = "iterative"
        else:
            raise ValueError("'eval_type' should be '0' for matrix evaluation "
                             "or '1' for iterative evaluation. The strings "
                             "'matrix' and 'iterative' can also be used.")
        # Call the iteration method
        #self.run()

    def _computePpolicyPRpolicy(self):
        # Compute the transition matrix and the reward matrix for a policy.
        #
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        # P(SxSxA)  = transition matrix 
        #     P could be an array with 3 dimensions or a cell array (1xA),
        #     each cell containing a matrix (SxS) possibly sparse
        # R(SxSxA) or (SxA) = reward matrix
        #     R could be an array with 3 dimensions (SxSxA) or 
        #     a cell array (1xA), each cell containing a sparse matrix (SxS) or
        #     a 2D array(SxA) possibly sparse  
        # policy(S) = a policy
        #
        # Evaluation
        # ----------
        # Ppolicy(SxS)  = transition matrix for policy
        # PRpolicy(S)   = reward matrix for policy
        #
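        # In index form the result is
        #     Ppolicy[s, :] = P[policy[s]][s, :]
        #     Rpolicy[s]    = R[policy[s]][s]
        # assembled action-by-action in the loop below.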
        Ppolicy = empty((self.S, self.S))
        Rpolicy = zeros(self.S)
        for aa in range(self.A): # avoid looping over S
            # the rows that use action a.
            ind = (self.policy == aa).nonzero()[0]
            # if no rows use action a, then no need to assign this
            if ind.size > 0:
                try:
                    Ppolicy[ind, :] = self.P[aa][ind, :]
                except ValueError:
                    Ppolicy[ind, :] = self.P[aa][ind, :].todense()
                #PR = self._computePR() # an apparently unneeded line, and
                # perhaps harmful in this implementation c.f.
                # mdp_computePpolicyPRpolicy.m
                Rpolicy[ind] = self.R[aa][ind]
        # self.R cannot be sparse with the code in its current condition, but
        # it should be possible in the future. Also, if R is so big that it's
        # a good idea to use a sparse matrix for it, then converting PRpolicy
        # from a dense to sparse matrix doesn't seem very memory efficient
        if type(self.R) is sparse:
            Rpolicy = sparse(Rpolicy)
        #self.Ppolicy = Ppolicy
        #self.Rpolicy = Rpolicy
        return (Ppolicy, Rpolicy)
    
    def _evalPolicyIterative(self, V0=0, epsilon=0.0001, max_iter=10000):
        # Evaluate a policy using iteration.
        #
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        # P(SxSxA)  = transition matrix 
        #    P could be an array with 3 dimensions or 
        #    a cell array (1xA), each cell containing a matrix possibly sparse
        # R(SxSxA) or (SxA) = reward matrix
        #    R could be an array with 3 dimensions (SxSxA) or 
        #    a cell array (1xA), each cell containing a sparse matrix (SxS) or
        #    a 2D array(SxA) possibly sparse  
        # discount  = discount rate in ]0; 1[
        # policy(S) = a policy
        # V0(S)     = starting value function, optional (default : zeros(S,1))
        # epsilon   = epsilon-optimal policy search, greater than 0,
        #    optional (default : 0.0001)
        # max_iter  = maximum number of iterations to be done, greater than 0,
        #    optional (default : 10000)
        #    
        # Evaluation
        # ----------
        # Vpolicy(S) = value function, associated to a specific policy
        #
        # Notes
        # -----
        # In verbose mode, at each iteration, displays the condition which
        # stopped iterations: epsilon-optimum value function found or maximum
        # number of iterations reached.
        #
        try:
            assert V0.shape in ((self.S, ), (self.S, 1), (1, self.S)), \
                "'V0' must be a vector of length S."
            policy_V = array(V0).reshape(self.S)
        except AttributeError:
            if V0 == 0:
                policy_V = zeros(self.S)
            else:
                policy_V = array(V0).reshape(self.S)

        policy_P, policy_R = self._computePpolicyPRpolicy()

        if self.verbose:
            print('    Iteration\t\t    V variation')

        itr = 0
        done = False
        while not done:
            itr += 1

            Vprev = policy_V
            policy_V = policy_R + self.discount * policy_P.dot(Vprev)

            variation = absolute(policy_V - Vprev).max()
            if self.verbose:
                print(('      %s\t\t      %s') % (itr, variation))
            
            # ensure |Vn - Vpolicy| < epsilon
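            # (standard contraction bound: ||V_n - V_policy|| is at most
            #  discount / (1 - discount) * ||V_n - V_n-1||, hence the
            #  (1 - discount) / discount scaling of epsilon below)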
            if variation < ((1 - self.discount) / self.discount) * epsilon:
                done = True
                if self.verbose:
                    print(MSG_STOP_EPSILON_OPTIMAL_VALUE)
            elif itr == max_iter:
                done = True
                if self.verbose:
                    print(MSG_STOP_MAX_ITER)

        self.V = policy_V
    
    def _evalPolicyMatrix(self):
        # Evaluate the value function of the policy using linear equations.
        #
        # Arguments 
        # ---------
        # Let S = number of states, A = number of actions
        # P(SxSxA) = transition matrix 
        #      P could be an array with 3 dimensions or a cell array (1xA),
        #      each cell containing a matrix (SxS) possibly sparse
        # R(SxSxA) or (SxA) = reward matrix
        #      R could be an array with 3 dimensions (SxSxA) or 
        #      a cell array (1xA), each cell containing a sparse matrix (SxS) or
        #      a 2D array(SxA) possibly sparse  
        # discount = discount rate in ]0; 1[
        # policy(S) = a policy
        #
        # Evaluation
        # ----------
        # Vpolicy(S) = value function of the policy
        #
        Ppolicy, Rpolicy = self._computePpolicyPRpolicy()
        # V = PR + gPV  => (I-gP)V = PR  => V = inv(I-gP)* PR
        self.V = self._lin_eq(
            (self._speye(self.S, self.S) - self.discount * Ppolicy), Rpolicy)
    
    def run(self):
        # Run the policy iteration algorithm.
        # If verbose then print a header
        if self.verbose:
            print('  Iteration\t\tNumber of different actions')
        # Set up the while stopping condition and the current time
        done = False
        self.time = time()
        # loop until a stopping condition is reached
        while not done:
            self.iter += 1
            # these _evalPolicy* functions will update the class's value
            # attribute
            if self.eval_type == "matrix":
                self._evalPolicyMatrix()
            elif self.eval_type == "iterative":
                self._evalPolicyIterative()
            # This should update the class's policy attribute but leave the
            # value alone
            policy_next, null = self._bellmanOperator()
            del null
            # calculate in how many places the old policy disagrees with
            # the new policy
            n_different = (policy_next != self.policy).sum()
            # if verbose then continue printing a table
            if self.verbose:
                print(('    %s\t\t  %s') % (self.iter, n_different))
            # Once the policy is unchanging or the maximum number of
            # iterations has been reached then stop
            if n_different == 0:
                done = True
                if self.verbose:
                    print(MSG_STOP_UNCHANGING_POLICY)
            elif (self.iter == self.max_iter):
                done = True
                if self.verbose:
                    print(MSG_STOP_MAX_ITER)
            else:
                self.policy = policy_next
        # update the time to return the computation time
        self.time = time() - self.time
        # store value and policy as tuples
        self.V = tuple(self.V.tolist())
        self.policy = tuple(self.policy.tolist())

class PolicyIterationModified(PolicyIteration):
    
    """A discounted MDP solved using a modified policy iteration algorithm.
    
    Arguments
    ---------
    transitions : array
        Transition probability matrices. See the documentation for the ``MDP``
        class for details.
    reward : array
        Reward matrices or vectors. See the documentation for the ``MDP`` class
        for details.
    discount : float
        Discount factor. See the documentation for the ``MDP`` class for
        details.
    epsilon : float, optional
        Stopping criterion. See the documentation for the ``MDP`` class for
        details. Default: 0.01.
    max_iter : int, optional
        Maximum number of iterations. See the documentation for the ``MDP``
        class for details. Default is 10.
    
    Data Attributes
    ---------------
    V : tuple
        value function 
    policy : tuple
        optimal policy
    iter : int
        number of done iterations
    time : float
        used CPU time
    
    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.forest()
    >>> pim = mdptoolbox.mdp.PolicyIterationModified(P, R, 0.9)
    >>> pim.run()
    >>> pim.policy
    (0, 0, 0)
    >>> pim.V
    (21.81408652334702, 25.054086523347017, 29.054086523347017)

    """
    
    def __init__(self, transitions, reward, discount, epsilon=0.01,
                 max_iter=10):
        # Initialise a (modified) policy iteration MDP.

        # Maybe it's better not to subclass from PolicyIteration, because the
        # initialisation of the two are quite different. eg there is policy0
        # being calculated here which doesn't need to be. The only thing that
        # is needed from the PolicyIteration class is the _evalPolicyIterative
        # function. Perhaps there is a better way to do it?
        PolicyIteration.__init__(self, transitions, reward, discount, None,
                                 max_iter, 1)

        # PolicyIteration doesn't pass epsilon to MDP.__init__() so we will
        # check it here
        self.epsilon = float(epsilon)
        assert epsilon > 0, "'epsilon' must be greater than 0."

        # computation of threshold of variation for V for an epsilon-optimal
        # policy
        if self.discount != 1:
            self.thresh = self.epsilon * (1 - self.discount) / self.discount
        else:
            self.thresh = self.epsilon

        if self.discount == 1:
            self.V = zeros((self.S, 1))
        else:
            Rmin = min(R.min() for R in self.R)
            self.V = 1 / (1 - self.discount) * Rmin * ones((self.S,))
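            # Rmin / (1 - discount) is a lower bound on any discounted return,
            # so this initial V underestimates the optimal value function.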
        
        # Call the iteration method
        #self.run()
    
    def run(self):
        # Run the modified policy iteration algorithm.

        if self.verbose:
            print('  \tIteration\t\tV-variation')

        self.time = time()

        done = False
        while not done:
            self.iter += 1

            self.policy, Vnext = self._bellmanOperator()
            #[Ppolicy, PRpolicy] = mdp_computePpolicyPRpolicy(P, PR, policy);
            
            variation = getSpan(Vnext - self.V)
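            # getSpan (imported from util) measures the span of the change,
            # i.e. its maximum minus its minimum; iteration stops once this
            # span falls below the threshold computed in __init__.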
            if self.verbose:
                print(("    %s\t\t  %s" % (self.iter, variation)))

            self.V = Vnext
            if variation < self.thresh:
                done = True
            else:
                is_verbose = False
                if self.verbose:
                    self.setSilent()
                    is_verbose = True

                self._evalPolicyIterative(self.V, self.epsilon, self.max_iter)

                if is_verbose:
                    self.setVerbose()

        self.time = time() - self.time

        # store value and policy as tuples
        self.V = tuple(self.V.tolist())
        self.policy = tuple(self.policy.tolist())

class QLearning(MDP):
    
    """A discounted MDP solved using the Q learning algorithm.
    
    Parameters
    ----------
    transitions : array
        Transition probability matrices. See the documentation for the ``MDP``
        class for details.
    reward : array
        Reward matrices or vectors. See the documentation for the ``MDP`` class
        for details.
    discount : float
        Discount factor. See the documentation for the ``MDP`` class for
        details.