# -*- coding: utf-8 -*-
"""Markov Decision Process (MDP) Toolbox
=====================================

The MDP toolbox provides classes and functions for the resolution of
discrete-time Markov Decision Processes.

Available classes
-----------------
MDP
    Base Markov decision process class
FiniteHorizon
    Backwards induction finite horizon MDP
LP
    Linear programming MDP
PolicyIteration
    Policy iteration MDP
PolicyIterationModified
    Modified policy iteration MDP
QLearning
    Q-learning MDP
RelativeValueIteration
    Relative value iteration MDP
ValueIteration
    Value iteration MDP
ValueIterationGS
    Gauss-Seidel value iteration MDP

Available functions
-------------------
check
    Check that an MDP is properly defined
checkSquareStochastic
    Check that a matrix is square and stochastic
exampleForest
    A simple forest management example
exampleRand
    A random example

How to use the documentation
----------------------------
Documentation is available both as docstrings provided with the code and
in html or pdf format from 
`The MDP toolbox homepage <http://www.somewhere.com>`_. The docstring
examples assume that the `mdp` module has been imported::

  >>> import mdp

Code snippets are indicated by three greater-than signs::

  >>> x = 17
  >>> x = x + 1
  >>> x
  18

The documentation can be displayed with
`IPython <http://ipython.scipy.org>`_. For example, to view the docstring of
the ValueIteration class use ``mdp.ValueIteration?<ENTER>``, and to view its
source code use ``mdp.ValueIteration??<ENTER>``.

Acknowledgments
---------------
This module is modified from the MDPtoolbox (c) 2009 INRA available at 
http://www.inra.fr/mia/T/MDPtoolbox/.

"""

# Copyright (c) 2011-2013 Steven A. W. Cordwell
# Copyright (c) 2009 INRA
# 
# All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 
#   * Redistributions of source code must retain the above copyright notice,
#     this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions and the following disclaimer in the documentation
#     and/or other materials provided with the distribution.
#   * Neither the name of the <ORGANIZATION> nor the names of its contributors
#     may be used to endorse or promote products derived from this software
#     without specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

from math import ceil, log, sqrt
from random import random
from time import time

from numpy import absolute, array, diag, empty, mean, mod, multiply
from numpy import ndarray, ones, where, zeros
from numpy.random import rand, randint
from scipy.sparse import csr_matrix as sparse
from scipy.sparse import coo_matrix, dok_matrix

# __all__ = ["check",  "checkSquareStochastic"]

# These need to be fixed so that we use classes derived from Error.
mdperr = {
"mat_nonneg" :
    "PyMDPtoolbox: Probabilities must be non-negative.",
"mat_square" :
    "PyMDPtoolbox: The matrix must be square.",
"mat_stoch" :
    "PyMDPtoolbox: Rows of the matrix must sum to one (1).",
"mask_numpy" :
    "PyMDPtoolbox: mask must be a numpy array or matrix; i.e. type(mask) is "
    "ndarray or type(mask) is matrix.", 
"mask_SbyS" : 
    "PyMDPtoolbox: The mask must have shape SxS; i.e. mask.shape = (S, S).",
"obj_shape" :
    "PyMDPtoolbox: Object arrays for transition probabilities and rewards "
    "must have only 1 dimension: the number of actions A. Each element of "
    "the object array contains an SxS ndarray or matrix.",
"obj_square" :
    "PyMDPtoolbox: Each element of an object array for transition "
    "probabilities and rewards must contain an SxS ndarray or matrix; i.e. "
    "P[a].shape = (S, S) or R[a].shape = (S, S).",
"P_type" :
    "PyMDPtoolbox: The transition probabilities must be in a numpy array; "
    "i.e. type(P) is ndarray.",
"P_shape" :
    "PyMDPtoolbox: The transition probability array must have the shape "
    "(A, S, S) with S : number of states greater than 0 and A : number of "
    "actions greater than 0, i.e. P.shape = (A, S, S).",
"PR_incompat" :
    "PyMDPtoolbox: Incompatibility between P and R dimensions.",
"prob_in01" :
    "PyMDPtoolbox: Probability p must be in [0; 1].",
"R_type" :
    "PyMDPtoolbox: The rewards must be in a numpy array; i.e. type(R) is "
    "ndarray, or numpy matrix; i.e. type(R) is matrix.",
"R_shape" :
    "PyMDPtoolbox: The reward matrix R must be an array of shape (A, S, S) or "
    "(S, A) with S : number of states greater than 0 and A : number of "
    "actions greater than 0. i.e. R.shape = (S, A) or (A, S, S).",
"R_gt_0" :
    "PyMDPtoolbox: The rewards must be greater than 0.",
"S_gt_1" :
    "PyMDPtoolbox: Number of states S must be greater than 1.",
"SA_gt_1" : 
    "PyMDPtoolbox: The number of states S and the number of actions A must be "
    "greater than 1.",
"discount_rng" :
    "PyMDPtoolbox: Discount rate must be in ]0; 1].",
"maxi_min" :
    "PyMDPtoolbox: The maximum number of iterations must be greater than 0"
}

def check(P, R):
    """Check if P and R define a Markov Decision Process.
    
    Let S = number of states, A = number of actions.
    
    Parameters
    ----------
    P : array
        The transition matrices. It can be a three dimensional array with
        a shape of (A, S, S). It can also be a one dimensional array with
        a shape of (A, ), where each element contains a matrix of shape (S, S)
        which can possibly be sparse.
    R : array
        The reward matrix. It can be a three dimensional array with a
        shape of (A, S, S). It can also be a one dimensional array with a
        shape of (A, ), where each element contains a matrix with a shape of
        (S, S) which can possibly be sparse. It can also be an array with
        a shape of (S, A) which can possibly be sparse.  
    
    Notes
    -----
    Raises an error if P and R do not define an MDP.
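
    Examples
    --------
    A minimal illustration using the matrices from ``exampleForest``; nothing
    is printed when the check passes:

    >>> import mdp
    >>> P, R = mdp.exampleForest()
    >>> mdp.check(P, R)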

    """
    # Checking P
    try:
        if P.ndim == 3:
            aP, sP0, sP1 = P.shape
        elif P.ndim == 1:
            # A hack so that we can go into the next try-except statement and
            # continue checking from there
            raise AttributeError
        else:
            raise ValueError(mdperr["P_shape"])
    except AttributeError:
        try:
            aP = len(P)
            sP0, sP1 = P[0].shape
            for aa in xrange(1, aP):
                sP0aa, sP1aa = P[aa].shape
                if (sP0aa != sP0) or (sP1aa != sP1):
                    raise ValueError(mdperr["obj_square"])
        except AttributeError:
            raise TypeError(mdperr["P_shape"])
    except:
        raise
    # Checking R
    try:
        if R.ndim == 2:
            sR0, aR = R.shape
            sR1 = sR0
        elif R.ndim == 3:
            aR, sR0, sR1 = R.shape
        elif R.ndim == 1:
            # A hack so that we can go into the next try-except statement
            raise AttributeError
        else:
            raise ValueError(mdperr["R_shape"])
    except AttributeError:
        try:
            aR = len(R)
            sR0, sR1 = R[0].shape
            for aa in range(1, aR):
                sR0aa, sR1aa = R[aa].shape
                if ((sR0aa != sR0) or (sR1aa != sR1)):
                    raise ValueError(mdperr["obj_square"])
        except AttributeError:
            raise ValueError(mdperr["R_shape"])
    except:
        raise
    # Checking dimensions
    if (sP0 < 1) or (aP < 1) or (sP0 != sP1):
        raise ValueError(mdperr["P_shape"])   
    if (sR0 < 1) or (aR < 1) or (sR0 != sR1):
        raise ValueError(mdperr["R_shape"])
    if (sP0 != sR0) or (aP != aR):
        raise ValueError(mdperr["PR_incompat"])
    # Check that the P's are square and stochastic
    for aa in xrange(aP):
        checkSquareStochastic(P[aa])
        #checkSquareStochastic(P[aa, :, :])
    # We are at the end of the checks, so if no exceptions have been raised
    # then that means there are (hopefully) no errors and we return None
    return None
    
    # These are the old code comments, which need to be converted to
    # information in the docstring:
    #
    # transitions must be a numpy array, either an AxSxS ndarray (with any 
    # dtype other than "object"); or, a 1xA ndarray with an "object" dtype, 
    # and each element containing an SxS array. An AxSxS array will be
    # converted to an object array. A numpy object array is similar to a
    # MATLAB cell array.
    #
    # NumPy has an array type of 'object', which is roughly equivalent to
    # the MATLAB cell array. These are most useful for storing sparse
    # matrices as these can only have two dimensions whereas we want to be
    # able to store a transition matrix for each action. If the dtype of
    # the transition probability array is object then we store this as
    # P_is_object = True.
    # If it is an object array, then it should only have one dimension
    # otherwise fail with a message explaining why.
    # If it is a normal array then the number of dimensions must be exactly
    # three, otherwise fail with a message explaining why.
    #
    # As above but for the reward array. A difference is that the reward
    # array can have either two or 3 dimensions.
    #
    # We want to make sure that the transition probability array and the 
    # reward array are in agreement. This means that both should show that
    # there are the same number of actions and the same number of states.
    # Furthermore the transition probability matrices must be SxS in
    # shape, so we check for that also.
    #
        # If the user has put their transition matrices into a numpy array
        # with dtype of 'object', then it is possible that they have made a
        # mistake and not all of the matrices are of the same shape. So,
        # here we record the number of actions and states that the first
        # matrix in element zero of the object array says it has. After
        # that we check that every other matrix also reports the same
        # number of actions and states, otherwise fail with an error.
        # aP: the number of actions in the transition array. This
        # corresponds to the number of elements in the object array.
        #
        # sP0: the number of states as reported by the number of rows of
        # the transition matrix
        # sP1: the number of states as reported by the number of columns of
        # the transition matrix
        #
        # Now we check to see that every element of the object array holds
        # a matrix of the same shape, otherwise fail.
        #
            # sp0aa and sp1aa represent the number of states in each
            # subsequent element of the object array. If it doesn't match
            # what was found in the first element, then we need to fail
            # telling the user what needs to be fixed.
            #
        # if we are using a normal array for this, then the first
        # dimension should be the number of actions, and the second and 
        # third should be the number of states
        #
    # the first dimension of the transition matrix must report the same
    # number of states as the second dimension. If not then we are not
    # dealing with a square matrix and it is not a valid transition
    # probability. Also, if the number of actions is less than one, or the
    # number of states is less than one, then it also is not a valid
    # transition probability.
    #
    # now we check that each transition matrix is square-stochastic. For
    # object arrays this is the matrix held in each element, but for
    # normal arrays this is a matrix formed by taking a slice of the array
    #
        # if the reward array has an object dtype, then we check that
        # each element contains a matrix of the same shape as we did 
        # above with the transition array.
        #
        # This indicates that the reward matrices are constructed per 
        # transition, so that the first dimension is the actions and
        # the second two dimensions are the states.
        #
        # then the reward matrix is per state, so the first dimension is 
        # the states and the second dimension is the actions.
        #
        # this is added just so that the next check doesn't error out
        # saying that sR1 doesn't exist
        #
    # the number of actions must be more than zero, the number of states
    # must also be more than 0, and the states must agree
    #
    # now we check to see that what the transition array is reporting and
    # what the reward array is reporting agree as to the number of actions
    # and states. If not then fail explaining the situation

def checkSquareStochastic(Z):
    """Check if Z is a square stochastic matrix.
    
    Let S = number of states.
    
    Parameters
    ----------
    Z : matrix
        This should be a two dimensional array with a shape of (S, S). It can
        possibly be sparse.
    
    Notes 
    -----
    Returns None if no error has been detected, else it raises an error.
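
    Examples
    --------
    An illustrative check on a small stochastic matrix; nothing is printed
    when the matrix passes:

    >>> import mdp
    >>> from numpy import array
    >>> mdp.checkSquareStochastic(array([[0.5, 0.5], [0.0, 1.0]]))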
    
    """
    # try to get the shape of the matrix
    try:
        s1, s2 = Z.shape
    except AttributeError:
        raise TypeError("Matrix should be a numpy type.")
    except ValueError:
        raise ValueError(mdperr["mat_square"])
    # check that the matrix is square, and that each row sums to one
    if s1 != s2:
        raise ValueError(mdperr["mat_square"])
    elif (absolute(Z.sum(axis=1) - ones(s2))).max() > 10e-12:
        raise ValueError(mdperr["mat_stoch"])
    # make sure that there are no values less than zero
    try:
        if (Z < 0).any():
            raise ValueError(mdperr["mat_nonneg"])
    except AttributeError:
        try:
            if (Z.data < 0).any():
                raise ValueError(mdperr["mat_nonneg"])
        except AttributeError:
            raise TypeError("Matrix should be a numpy type.")
    except:
        raise
    
    return(None)

def exampleForest(S=3, r1=4, r2=2, p=0.1, is_sparse=False):
    """Generate an MDP example based on a simple forest management scenario.
    
    This function is used to generate a transition probability
    (``A`` × ``S`` × ``S``) array ``P`` and a reward (``S`` × ``A``) matrix
    ``R`` that model the following problem. A forest is managed by two actions:
    'Wait' and 'Cut'. An action is decided each year with first the objective
    to maintain an old forest for wildlife and second to make money selling cut
    wood. Each year there is a probability ``p`` that a fire burns the forest.
    
    Here is how the problem is modelled.
    Let {1, 2 . . . ``S`` } be the states of the forest, with ``S`` being the 
    oldest. Let 'Wait' be action 1 and 'Cut' action 2.
    After a fire, the forest is in the youngest state, that is state 1.
    The transition matrix P of the problem can then be defined as follows::
        
                   | p 1-p 0.......0  |
                   | .  0 1-p 0....0  |
        P[0,:,:] = | .  .  0  .       |
                   | .  .        .    |
                   | .  .         1-p |
                   | p  0  0....0 1-p |
        
                   | 1 0..........0 |
                   | . .          . |
        P[1,:,:] = | . .          . |
                   | . .          . |
                   | . .          . |
                   | 1 0..........0 |
    
    The reward matrix R is defined as follows::
        
                 |  0  |
                 |  .  |
        R[:,0] = |  .  |
                 |  .  |
                 |  0  |
                 |  r1 |
        
                 |  0  |
                 |  1  |
        R[:,1] = |  .  |
                 |  .  |
                 |  1  |
                 |  r2 |
    
    Parameters
    ----------
    S : int, optional
        The number of states, which should be an integer greater than 0. By
        default it is 3.
    r1 : float, optional
        The reward when the forest is in its oldest state and action 'Wait' is
        performed. By default it is 4.
    r2 : float, optional
        The reward when the forest is in its oldest state and action 'Cut' is
        performed. By default it is 2.
    p : float, optional
        The probability of wild fire occurrence, in the range ]0, 1[. By
        default it is 0.1.
    is_sparse : bool, optional
        If True, the transition matrices in ``P`` are returned as scipy sparse
        (CSR) matrices. By default it is False.
    
    Returns
    -------
    out : tuple
        ``out[0]`` contains the transition probability matrix P with a shape of
        (A, S, S). ``out[1]`` contains the reward matrix R with a shape of
        (S, A).
    
    Examples
    --------
    >>> import mdp
    >>> P, R = mdp.exampleForest()
    >>> P
    array([[[ 0.1,  0.9,  0. ],
            [ 0.1,  0. ,  0.9],
            [ 0.1,  0. ,  0.9]],
    <BLANKLINE>
           [[ 1. ,  0. ,  0. ],
            [ 1. ,  0. ,  0. ],
            [ 1. ,  0. ,  0. ]]])
    >>> R
    array([[ 0.,  0.],
           [ 0.,  1.],
           [ 4.,  2.]])
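
    A sparse variant can also be requested; in that case ``P`` is returned as
    a list of scipy sparse (CSR) matrices (illustrative, no output expected):

    >>> Psp, Rsp = mdp.exampleForest(S=10, is_sparse=True)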
    
    """
    if S <= 1:
        raise ValueError(mdperr["S_gt_1"])
    if (r1 <= 0) or (r2 <= 0):
        raise ValueError(mdperr["R_gt_0"])
    if (p < 0) or (p > 1):
        raise ValueError(mdperr["prob_in01"])
    # Definition of Transition matrix P(:,:,1) associated to action Wait
    # (action 1) and P(:,:,2) associated to action Cut (action 2)
    #             | p 1-p 0.......0  |                  | 1 0..........0 |
    #             | .  0 1-p 0....0  |                  | . .          . |
    #  P(:,:,1) = | .  .  0  .       |  and P(:,:,2) =  | . .          . |
    #             | .  .        .    |                  | . .          . |
    #             | .  .         1-p |                  | . .          . |
    #             | p  0  0....0 1-p |                  | 1 0..........0 |
    if is_sparse:
        P = []
        rows = range(S) * 2
        cols = [0] * S + range(1, S) + [S - 1]
        vals = [p] * S + [1-p] * S
        P.append(coo_matrix((vals, (rows, cols)), shape=(S,S)).tocsr())
        rows = range(S)
        cols = [0] * S
        vals = [1] * S
        P.append(coo_matrix((vals, (rows, cols)), shape=(S,S)).tocsr())
    else:
        P = zeros((2, S, S))
        P[0, :, :] = (1 - p) * diag(ones(S - 1), 1)
        P[0, :, 0] = p
        P[0, S - 1, S - 1] = (1 - p)
        P[1, :, :] = zeros((S, S))
        P[1, :, 0] = 1
    # Definition of Reward matrix R1 associated to action Wait and 
    # R2 associated to action Cut
    #           | 0  |                   | 0  |
    #           | .  |                   | 1  |
    #  R(:,1) = | .  |  and     R(:,2) = | .  |	
    #           | .  |                   | .  |
    #           | 0  |                   | 1  |                   
    #           | r1 |                   | r2 |
    R = zeros((S, 2))
    R[S - 1, 0] = r1
    R[:, 1] = ones(S)
    R[0, 1] = 0
    R[S - 1, 1] = r2
    # we want to return the generated transition and reward matrices
    return (P, R)

def exampleRand(S, A, is_sparse=False, mask=None):
    """Generate a random Markov Decision Process.
    
    Parameters
    ----------
    S : int
        number of states (> 0)
    A : int
        number of actions (> 0)
    is_sparse : logical, optional
        false to have matrices in dense format, true to have sparse
        matrices (default false).
    mask : array or None, optional
        matrix with 0 and 1 (0 indicates a place for a zero probability),
        with a shape of (S, S) or (A, S, S) (default: a random mask is used)
    
    Returns
    -------
    out : tuple
        ``out[0]`` contains the transition probability matrix P with a shape of
        (A, S, S). ``out[1]`` contains the reward matrix R with a shape of
        (A, S, S).

    Examples
    --------
    >>> import mdp
    >>> P, R = mdp.exampleRand(5, 3)
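
    A mask can be supplied to restrict which transitions are allowed to be
    non-zero; an all-ones (S, S) mask, for instance, allows every transition
    (illustrative only, the generated matrices are random):

    >>> from numpy import ones
    >>> P, R = mdp.exampleRand(5, 3, mask=ones((5, 5)))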
    
    """
    # making sure that there is at least one state and one action
    if (S < 1) or (A < 1):
        raise ValueError(mdperr["SA_gt_1"])
    # if the user has specified a mask, check that it has a valid shape
    if mask is not None:
        # the mask needs to be SxS or AxSxS
        try:
            if mask.shape not in ((S, S), (A, S, S)):
                raise ValueError(mdperr["mask_SbyS"])
        except AttributeError:
            raise TypeError(mdperr["mask_numpy"])
    # generate the transition and reward matrices based on S, A and mask
    if is_sparse:
        # definition of transition matrix : square stochastic matrix
        P = [None] * A
        # definition of reward matrix (values between -1 and +1)
        R = [None] * A
        for a in xrange(A):
            # it may be more efficient to implement this by constructing lists
            # of rows, columns and values then creating a coo_matrix, but this
            # works for now
            PP = dok_matrix((S, S))
            RR = dok_matrix((S, S))
            for s in xrange(S):
                if mask is None:
                    m = rand(S)
                    m[m <= 2/3.0] = 0
                    m[m > 2/3.0] = 1
                elif mask.shape == (A, S, S):
                    m = mask[a][s] # mask[a, s, :]
                else:
                    m = mask[s]
                n = int(m.sum()) # m[s, :]
                if n == 0:
                    PP[s, randint(0, S)] = 1
                else:
                    rows = s * ones(n, dtype=int)
                    cols = where(m)[0] # m[s, :]
                    vals = rand(n)
                    vals = vals / vals.sum()
                    reward = 2*rand(n) - ones(n)
                    # I want to do this: PP[rows, cols] = vals, but it doesn't
                    # seem to work, as val[-1] is stored as the value for each
                    # row, column pair. Therefore the loop is needed.
                    for x in xrange(n):
                        PP[rows[x], cols[x]] = vals[x]
                        RR[rows[x], cols[x]] = reward[x]
            # PP.tocsr() takes the same amount of time as PP.tocoo().tocsr()
            # so constructing PP and RR as coo_matrix in the first place is
            # probably "better"
            P[a] = PP.tocsr()
            R[a] = RR.tocsr()
    else:
        # definition of transition matrix : square stochastic matrix
        P = zeros((A, S, S))
        # definition of reward matrix (values between -1 and +1)
        R = zeros((A, S, S))
        for a in range(A):
            for s in range(S):
                # create our own random mask if there is no user supplied one
                if mask is None:
                    m = rand(S)
                    r = random()
                    m[m <= r] = 0
                    m[m > r] = 1
                elif mask.shape == (A, S, S):
                    m = mask[a][s] # mask[a, s, :]
                else:
                    m = mask[s]
                # Make sure that there is at least one transition in each state
                if m.sum() == 0:
                    P[a, s, randint(0, S)] = 1
                else:
                    P[a][s] = m * rand(S)
                    P[a][s] = P[a][s] / P[a][s].sum()
                R[a][s] = (m * (2*rand(S) - ones(S, dtype=int)))
    # we want to return the generated transition and reward matrices
    return (P, R)

def getSpan(W):
    """Return the span of W
    
    sp(W) = max W(s) - min W(s)
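
    Examples
    --------
    A small illustrative check, assuming the module has been imported as
    ``mdp`` and numpy is available:

    >>> import mdp
    >>> from numpy import array
    >>> mdp.getSpan(array([2, 5, 3]))
    3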
    
    """
    return (W.max() - W.min())


class MDP(object):
    
    """A Markov Decision Problem.
    
    Parameters
    ----------
    transitions : array
            transition probability matrices
    reward : array
            reward matrices
    discount : float or None
            discount factor
    epsilon : float or None
            stopping criteria
    max_iter : int or None
            maximum number of iterations
    
    Attributes
    ----------
    P : array
        Transition probability matrices
    R : array
        Reward matrices
    V : list
        Value function
    discount : float
        The discount rate used in the computations
    max_iter : int
        The maximum number of iterations allowed
    policy : list
        The optimal policy
    time : float
        The CPU time used to run the algorithm
    verbose : logical
        Whether verbose output is displayed or not
    
    Methods
    -------
    iterate
        To be implemented in child classes, raises exception
    setSilent
        Turn the verbosity off
    setVerbose
        Turn the verbosity on
    
    """
    
    def __init__(self, transitions, reward, discount, epsilon, max_iter):
        """Initialise an MDP based on the input parameters."""
        
        # if the discount is None then the algorithm is assumed to not use it
        # in its computations
        if type(discount) in (int, float):
            if (discount <= 0) or (discount > 1):
                raise ValueError(mdperr["discount_rng"])
            else:
                if discount == 1:
                    print("PyMDPtoolbox WARNING: check conditions of "
                          "convergence. With no discount, convergence is not "
                          "always assumed.")
                self.discount = discount
        elif discount is not None:
            raise ValueError("PyMDPtoolbox: the discount must be a positive "
                             "real number less than or equal to one.")
        
        # if the max_iter is None then the algorithm is assumed to not use it
        # in its computations
        if type(max_iter) in (int, float):
            if max_iter <= 0:
                raise ValueError(mdperr["maxi_min"])
            else:
                self.max_iter = max_iter
        elif max_iter is not None:
            raise ValueError("PyMDPtoolbox: max_iter must be a positive real "
                             "number greater than zero.")
        
        if type(epsilon) in (int, float):
            if epsilon <= 0:
                raise ValueError("PyMDPtoolbox: epsilon must be greater than "
                                 "0.")
            else:
                # store epsilon, mirroring the discount and max_iter handling
                # above (assumed to be the intended behaviour)
                self.epsilon = epsilon
        elif epsilon is not None:
            raise ValueError("PyMDPtoolbox: epsilon must be a positive real "
                             "number greater than zero.")
        
        # we run a check on P and R to make sure they are describing an MDP. If
        # an exception isn't raised then they are assumed to be correct.
        check(transitions, reward)
        
        # computePR will assign the variables self.S, self.A, self.P and self.R
        self._computePR(transitions, reward)
        
        # the verbosity is by default turned off
        self.verbose = False
        # Initially the time taken to perform the computations is set to None
        self.time = None
        # set the initial iteration count to zero
        self.iter = 0
        
        # V should be stored as a vector ie shape of (S,) or (1, S)
        self.V = None
        # policy can also be stored as a vector
        self.policy = None
    
    def _bellmanOperator(self, V=None):
        """Apply the Bellman operator on the value function.
        
        Updates the value function and the Vprev-improving policy.
        
        Returns
        -------
        (policy, value) : tuple of new policy and its value
        
        """
        if V is None:
            # this V should be a reference to the data rather than a copy
            V = self.V
        else:
            try:
                if V.shape not in ((self.S,), (1, self.S)):
                    raise ValueError("bellman: V is not the right shape.")
            except AttributeError:
                raise TypeError("bellman: V must be a numpy array or matrix.")
        
        Q = empty((self.A, self.S))
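        # For each action a, build the state-action values
        #     Q[a, s] = R[a, s] + discount * sum_s' P[a][s, s'] * V[s']
        # and then act greedily: the returned policy is the argmax over
        # actions and the returned value function is the corresponding max.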
        for aa in range(self.A):
            Q[aa] = self.R[aa] + self.discount * self.P[aa].dot(V)
        
        # Which way is better?
        # 1. Return, (policy, value)
        return (Q.argmax(axis=0), Q.max(axis=0))
        # 2. update self.policy and self.V directly
        # self.V = Q.max(axis=0)
        # self.policy = Q.argmax(axis=0)
    
    def _computePR(self, P, R):
        """Compute the reward for the system in one state choosing an action.
        
        Arguments
        ---------
        Let S = number of states, A = number of actions
            P(SxSxA)  = transition matrix 
                P could be an array with 3 dimensions or  a cell array (1xA), 
                each cell containing a matrix (SxS) possibly sparse
            R(SxSxA) or (SxA) = reward matrix
                R could be an array with 3 dimensions (SxSxA) or  a cell array 
                (1xA), each cell containing a sparse matrix (SxS) or a 2D 
                array(SxA) possibly sparse  
        Evaluation
        ----------
            PR(SxA)   = reward matrix
        
        """
        # We assume that P and R define an MDP, i.e. that
        # check(P, R) has already been run and doesn't fail.
        #
        # Set self.P as a tuple of length A, with each element storing an S×S
        # matrix.
        self.A = len(P)
        try:
            if P.ndim == 3:
                self.S = P.shape[1]
            else:
                self.S = P[0].shape[0]
        except AttributeError:
            self.S = P[0].shape[0]
        except:
            raise
        # convert Ps to matrices
        self.P = []
        for aa in xrange(self.A):
            self.P.append(P[aa])
        self.P = tuple(self.P)
        # Set self.R as a tuple of length A, with each element storing a 1×S
        # vector.
        try:
            if R.ndim == 2:
                self.R = []
                for aa in xrange(self.A):
                    self.R.append(array(R[:, aa]).reshape(self.S))
            else:
                raise AttributeError
        except AttributeError:
            self.R = []
            for aa in xrange(self.A):
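                # R is given per transition here, either as an (A, S, S)
                # array or as an object array of S x S matrices, so the
                # expected reward of taking action a in state s is
                # sum over s' of P[a][s, s'] * R[a][s, s']. The first branch
                # handles scipy sparse matrices (.multiply), the fallback
                # handles dense numpy arrays.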
                try:
                    self.R.append(P[aa].multiply(R[aa]).sum(1).reshape(self.S))
                except AttributeError:
                    self.R.append(multiply(P[aa],R[aa]).sum(1).reshape(self.S))
                except:
                    raise
        except:
            raise
        self.R = tuple(self.R)
    
    def _iterate(self):
        """Raise error because child classes should implement this function."""
        raise NotImplementedError("You should create an _iterate() method.")
    
    def setSilent(self):
        """Set the MDP algorithm to silent mode."""
        self.verbose = False
    
    def setVerbose(self):
        """Set the MDP algorithm to verbose mode."""
        self.verbose = True

class FiniteHorizon(MDP):
    
    """An MDP solved using the finite-horizon backwards induction algorithm.
    
    Let S = number of states, A = number of actions
    
    Parameters
    ----------
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or a cell array (1xA),
             each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount factor, in ]0, 1]
    N        = number of periods, greater than 0
    h(S)     = terminal reward, optional (default [0; 0; ... 0] )
    
    Attributes
    ----------
    
    V(S,N+1)     = optimal value function
                 V(:,n) = optimal value function at stage n
                        with stage in 1, ..., N
                        V(:,N+1) = value function for terminal stage 
    policy(S,N)  = optimal policy
                 policy(:,n) = optimal policy at stage n
                        with stage in 1, ...,N
                        policy(:,N) = policy for stage N
    cpu_time = used CPU time
  
    Notes
    -----
    In verbose mode, displays the current stage and policy transpose.
    
    Examples
    --------
    >>> import mdp
    >>> P, R = mdp.exampleForest()
    >>> fh = mdp.FiniteHorizon(P, R, 0.9, 3)
    >>> fh.V
    array([[ 2.6973,  0.81  ,  0.    ,  0.    ],
           [ 5.9373,  3.24  ,  1.    ,  0.    ],
           [ 9.9373,  7.24  ,  4.    ,  0.    ]])
    >>> fh.policy
    array([[0, 0, 0],
           [0, 0, 1],
           [0, 0, 0]])
    """

    def __init__(self, transitions, reward, discount, N, h=None):
        """Initialise a finite horizon MDP."""
        if N < 1:
            raise ValueError('PyMDPtoolbox: N must be greater than 0')
        else:
            self.N = N
        # Initialise the base class
        MDP.__init__(self, transitions, reward, discount, None, None)
        # remove the iteration counter, it is not meaningful for backwards
        # induction
        del self.iter
        # There are value vectors for each time step up to the horizon
        self.V = zeros((self.S, N + 1))
        # There are policy vectors for each time step before the horizon, when
        # we reach the horizon we don't need to make decisions anymore.
        self.policy = empty((self.S, N), dtype=int)
        # Set the reward for the final transition to h, if specified.
        if h is not None:
            self.V[:, N] = h
        
        # Call the iteration method
        self._iterate()
        
    def _iterate(self):
        """Run the finite horizon algorithm."""
        self.time = time()
        
        for n in range(self.N):
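            # Backwards induction: apply the Bellman operator to the values
            # of the stage one step ahead (column N - n) and store the
            # resulting values and greedy policy in column N - n - 1.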
            W, X = self._bellmanOperator(self.V[:, self.N - n])
            self.V[:, self.N - n - 1] = X
            self.policy[:, self.N - n - 1] = W
            if self.verbose:
                print("stage: %s ... policy transpose : %s" % (
                    self.N - n, self.policy[:, self.N - n - 1].tolist()))
        
        self.time = time() - self.time
        # After this we could create a tuple of tuples for the values and 
        # policies.
        #V = []
        #p = []
        #for n in xrange(self.N):
        #    V.append()
        #    p.append()
        #V.append()
        #self.V = tuple(V)
        #self.policy = tuple(p)

class LP(MDP):
    
    """A discounted MDP solved using linear programming.

    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or a cell array (1xA),
             each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount rate, in ]0; 1[
    
    Evaluation
    ----------
    V(S)   = optimal values
    policy(S) = optimal policy
    cpu_time = used CPU time
    
    Notes    
    -----
    In verbose mode, the progress output of the cvxopt solver is displayed.
    
    Examples
    --------
    
    """

    def __init__(self, transitions, reward, discount):
        """Initialise a linear programming MDP."""
        
        try:
            from cvxopt import matrix, solvers
            self._linprog = solvers.lp
            self._cvxmat = matrix
        except ImportError:
            raise ImportError("The python module cvxopt is required to use "
                              "linear programming functionality.")
        
        from scipy.sparse import eye as speye
        self._speye = speye
        
        MDP.__init__(self, transitions, reward, discount, None, None)
        
        # this doesn't do what I want it to do c.f. issue #3
        if not self.verbose:
            solvers.options['show_progress'] = False
        
        # Call the iteration method
        self._iterate()
    
    def _iterate(self):
        """Run the linear programming algorithm."""
        self.time = time()
        # The objective is to solve : min V / V >= PR + discount*P*V
        # The function linprog of the optimisation Toolbox of Mathworks
        # solves :
        # min f'* x / M * x <= b
        # So the objective could be expressed as :
        # min V / (discount*P-I) * V <= - PR
        # To avoid a loop over states, the matrix M is structured by actions:
        # M(A*S,S)
        f = self._cvxmat(ones((self.S, 1)))
        # self.R is stored by _computePR as a tuple of per-action vectors, so
        # stack the A action blocks to match the row blocks of M below
        # (assumed fix: a tuple object has no reshape method)
        h = self._cvxmat(array(self.R).reshape(self.S * self.A, 1), tc='d')
        M = zeros((self.A * self.S, self.S))
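        # Each action contributes a block of S rows to M: rows
        # aa*S ... (aa+1)*S - 1 encode (discount * P[aa] - I) V <= -R[aa],
        # which is equivalent to V >= R[aa] + discount * P[aa] V.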
        for aa in range(self.A):
            pos = (aa + 1) * self.S
            M[(pos - self.S):pos, :] = (
                self.discount * self.P[aa] - self._speye(self.S, self.S))
        M = self._cvxmat(M)
        # Using the glpk option will make this behave more like Octave
        # (Octave uses glpk) and perhaps Matlab. If solver=None (ie using the 
        # default cvxopt solver) then V agrees with the Octave equivalent
        # only to 10e-8 places.
        # flatten the column vector returned by cvxopt to shape (S,) so that
        # the Bellman operator below can use it (assumed fix)
        self.V = array(
            self._linprog(f, M, -h, solver='glpk')['x']).reshape(self.S)
        
        self.policy, self.V =  self._bellmanOperator()
        
        self.time = time() - self.time
        
        # store value and policy as tuples
        self.V = tuple(self.V.tolist())
        self.policy = tuple(self.policy.tolist())

class PolicyIteration(MDP):
    
    """A discounted MDP solved using the policy iteration algorithm.
    
    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or a cell array (1xA),
             each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount rate, in ]0, 1[
    policy0(S) = starting policy, optional 
    max_iter = maximum number of iterations to be done, greater than 0, 
             optional (default 1000)
    eval_type = type of function used to evaluate policy: 
             0 for mdp_eval_policy_matrix, else mdp_eval_policy_iterative
             optional (default 0)
             
    Evaluation
    ----------
    V(S)   = value function 
    policy(S) = optimal policy
    iter     = number of done iterations
    cpu_time = used CPU time
    
    Notes
    -----
    In verbose mode, at each iteration, displays the number 
    of different actions between policy n-1 and n
    
    Examples
    --------
    >>> import mdp
    >>> P, R = mdp.exampleRand(5, 3)
    >>> pi = mdp.PolicyIteration(P, R, 0.9)
    
    """
    
    def __init__(self, transitions, reward, discount, policy0=None,
                 max_iter=1000, eval_type=0):
        """Initialise a policy iteration MDP."""
        
        MDP.__init__(self, transitions, reward, discount, None, max_iter)
        
        if policy0 is None:
            # initialise the policy to the one which maximises the expected
            # immediate reward
            self.V = zeros(self.S)
            self.policy, null = self._bellmanOperator()
            del null
        else:
            policy0 = array(policy0)
            
            if not policy0.shape in ((self.S, ), (self.S, 1), (1, self.S)):
                raise ValueError("PyMDPtoolbox: policy0 must be a vector "
                                 "with length S.")
            
            policy0 = policy0.reshape(self.S)
            
            if (mod(policy0, 1).any() or (policy0 < 0).any() or
                    (policy0 >= self.S).any()):
                raise ValueError("PyMDPtoolbox: policy0 must be a vector of "
                                 "integers between 0 and S-1.")
            else:
                self.policy = policy0
        
        # set or reset the initial values to zero
        self.V = zeros(self.S)
        
        if eval_type in (0, "matrix"):
            from numpy.linalg import solve
            from scipy.sparse import eye
            self._speye = eye
            self._lin_eq = solve
            self.eval_type = "matrix"
        elif eval_type in (1, "iterative"):
            self.eval_type = "iterative"
        else:
            raise ValueError("PyMDPtoolbox: eval_type should be 0 for matrix "
                             "evaluation or 1 for iterative evaluation. "
                             "The strings 'matrix' and 'iterative' can also "
                             "be used.")
        
        # Call the iteration method
        self._iterate()
    
    def _computePpolicyPRpolicy(self):