# -*- coding: utf-8 -*-
"""Markov Decision Process (MDP) Toolbox
=====================================

The MDP toolbox provides classes and functions for the resolution of
discrete-time Markov Decision Processes.

Available classes
-----------------
MDP
    Base Markov decision process class
FiniteHorizon
    Backwards induction finite horizon MDP
LP
    Linear programming MDP
PolicyIteration
    Policy iteration MDP
PolicyIterationModified
    Modified policy iteration MDP
QLearning
    Q-learning MDP
RelativeValueIteration
    Relative value iteration MDP
ValueIteration
    Value iteration MDP
ValueIterationGS
    Gauss-Seidel value iteration MDP

Available functions
-------------------
check
    Check that an MDP is properly defined
checkSquareStochastic
    Check that a matrix is square and stochastic
exampleForest
    A simple forest management example
exampleRand
    A random example

How to use the documentation
----------------------------
Documentation is available both as docstrings provided with the code and
in html or pdf format from 
`The MDP toolbox homepage <http://www.somewhere.com>`_. The docstring
examples assume that the `mdp` module has been imported::

  >>> import mdp

Code snippets are indicated by three greater-than signs::

  >>> x = 17
  >>> x = x + 1
  >>> x
  18

The documentation can be displayed with
`IPython <http://ipython.scipy.org>`_. For example, to view the docstring of
the ValueIteration class use ``mdp.ValueIteration?<ENTER>``, and to view its
source code use ``mdp.ValueIteration??<ENTER>``.

Acknowledgments
---------------
This module is modified from the MDPtoolbox (c) 2009 INRA available at 
http://www.inra.fr/mia/T/MDPtoolbox/.

"""

# Copyright (c) 2011-2013 Steven A. W. Cordwell
# Copyright (c) 2009 INRA
# 
# All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 
#   * Redistributions of source code must retain the above copyright notice,
#     this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions and the following disclaimer in the documentation
#     and/or other materials provided with the distribution.
#   * Neither the name of the <ORGANIZATION> nor the names of its contributors
#     may be used to endorse or promote products derived from this software
#     without specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

from math import ceil, log, sqrt
from random import random
from time import time

from numpy import absolute, array, diag, empty, mean, mod, multiply
from numpy import ndarray, ones, where, zeros
from numpy.random import rand, randint
from scipy.sparse import csr_matrix as sparse
from scipy.sparse import coo_matrix, dok_matrix

# __all__ = ["check",  "checkSquareStochastic"]

# These need to be fixed so that we use classes derived from Error.
mdperr = {
"mat_nonneg" :
    "PyMDPtoolbox: Probabilities must be non-negative.",
"mat_square" :
    "PyMDPtoolbox: The matrix must be square.",
"mat_stoch" :
    "PyMDPtoolbox: Rows of the matrix must sum to one (1).",
"mask_numpy" :
    "PyMDPtoolbox: mask must be a numpy array or matrix; i.e. type(mask) is "
    "ndarray or type(mask) is matrix.", 
"mask_SbyS" : 
    "PyMDPtoolbox: The mask must have shape SxS; i.e. mask.shape = (S, S).",
"obj_shape" :
    "PyMDPtoolbox: Object arrays for transition probabilities and rewards "
    "must have only 1 dimension: the number of actions A. Each element of "
    "the object array contains an SxS ndarray or matrix.",
"obj_square" :
    "PyMDPtoolbox: Each element of an object array for transition "
    "probabilities and rewards must contain an SxS ndarray or matrix; i.e. "
    "P[a].shape = (S, S) or R[a].shape = (S, S).",
"P_type" :
    "PyMDPtoolbox: The transition probabilities must be in a numpy array; "
    "i.e. type(P) is ndarray.",
"P_shape" :
    "PyMDPtoolbox: The transition probability array must have the shape "
    "(A, S, S) with S : number of states greater than 0 and A : number of "
    "actions greater than 0. i.e. P.shape = (A, S, S).",
"PR_incompat" :
    "PyMDPtoolbox: Incompatibility between P and R dimensions.",
"prob_in01" :
    "PyMDPtoolbox: Probability p must be in [0; 1].",
"R_type" :
    "PyMDPtoolbox: The rewards must be in a numpy array; i.e. type(R) is "
    "ndarray, or numpy matrix; i.e. type(R) is matrix.",
"R_shape" :
    "PyMDPtoolbox: The reward matrix R must be an array of shape (A, S, S) or "
    "(S, A) with S : number of states greater than 0 and A : number of "
    "actions greater than 0. i.e. R.shape = (S, A) or (A, S, S).",
"R_gt_0" :
    "PyMDPtoolbox: The rewards must be greater than 0.",
"S_gt_1" :
    "PyMDPtoolbox: Number of states S must be greater than 1.",
"SA_gt_1" : 
    "PyMDPtoolbox: The number of states S and the number of actions A must be "
    "greater than 1.",
"discount_rng" : 
    "PyMDPtoolbox: Discount rate must be in ]0; 1]",
"maxi_min" :
    "PyMDPtoolbox: The maximum number of iterations must be greater than 0"
}

def check(P, R):
    """Check if P and R define a Markov Decision Process.
    
    Let S = number of states, A = number of actions.
    
    Parameters
    ----------
    P : array
        The transition matrices. It can be a three dimensional array with
        a shape of (A, S, S). It can also be a one dimensional array with
        a shape of (A, ), where each element contains a matrix of shape (S, S)
        which can possibly be sparse.
    R : array
        The reward matrix. It can be a three dimensional array with a
        shape of (A, S, S). It can also be a one dimensional array with a
        shape of (A, ), where each element contains a matrix with a shape of
        (S, S) which can possibly be sparse. It can also be an array with
        a shape of (S, A) which can possibly be sparse.  
    
    Notes
    -----
    Raises an error if P and R do not define an MDP.
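
    Examples
    --------
    A minimal usage sketch; ``check`` raises an exception when P and R do not
    describe a valid MDP and otherwise returns None, so a successful call
    produces no output:

    >>> import mdp
    >>> P, R = mdp.exampleForest()
    >>> mdp.check(P, R)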

    """
    # Checking P
    try:
        if P.ndim == 3:
            aP, sP0, sP1 = P.shape
        elif P.ndim == 1:
            # A hack so that we can go into the next try-except statement and
            # continue checking from there
            raise AttributeError
        else:
            raise ValueError(mdperr["P_shape"])
    except AttributeError:
        try:
            aP = len(P)
            sP0, sP1 = P[0].shape
            for aa in xrange(1, aP):
                sP0aa, sP1aa = P[aa].shape
                if (sP0aa != sP0) or (sP1aa != sP1):
                    raise ValueError(mdperr["obj_square"])
        except AttributeError:
            raise TypeError(mdperr["P_shape"])
    except:
        raise
    # Checking R
    try:
        if R.ndim == 2:
            sR0, aR = R.shape
            sR1 = sR0
        elif R.ndim == 3:
            aR, sR0, sR1 = R.shape
        elif R.ndim == 1:
            # A hack so that we can go into the next try-except statement
            raise AttributeError
        else:
            raise ValueError(mdperr["R_shape"])
    except AttributeError:
        try:
            aR = len(R)
            sR0, sR1 = R[0].shape
            for aa in range(1, aR):
                sR0aa, sR1aa = R[aa].shape
                if ((sR0aa != sR0) or (sR1aa != sR1)):
                    raise ValueError(mdperr["obj_square"])
        except AttributeError:
            raise ValueError(mdperr["R_shape"])
    except:
        raise
    # Checking dimensions
    if (sP0 < 1) or (aP < 1) or (sP0 != sP1):
        raise ValueError(mdperr["P_shape"])   
    if (sR0 < 1) or (aR < 1) or (sR0 != sR1):
        raise ValueError(mdperr["R_shape"])
    if (sP0 != sR0) or (aP != aR):
        raise ValueError(mdperr["PR_incompat"])
    # Check that the P's are square and stochastic
    for aa in xrange(aP):
        checkSquareStochastic(P[aa])
        #checkSquareStochastic(P[aa, :, :])
    # We are at the end of the checks, so if no exceptions have been raised
    # then that means there are (hopefully) no errors and we return None
    return None
    
    # These are the old code comments, which need to be converted to
    # information in the docstring:
    #
    # transitions must be a numpy array: either an AxSxS ndarray (with any
    # dtype other than "object"); or, a 1xA ndarray with an "object" dtype,
    # and each element containing an SxS array. An AxSxS array will be
    # converted to an object array. A numpy object array is similar to a
    # MATLAB cell array.
    #
    # NumPy has an array type of 'object', which is roughly equivalent to
    # the MATLAB cell array. These are most useful for storing sparse
    # matrices as these can only have two dimensions whereas we want to be
    # able to store a transition matrix for each action. If the dtype of
    # the transition probability array is object then we store this as
    # P_is_object = True.
    # If it is an object array, then it should only have one dimension
    # otherwise fail with a message explaining why.
    # If it is a normal array then the number of dimensions must be exactly
    # three, otherwise fail with a message explaining why.
    #
    # As above but for the reward array. A difference is that the reward
    # array can have either two or three dimensions.
    #
    # We want to make sure that the transition probability array and the 
    # reward array are in agreement. This means that both should show that
    # there are the same number of actions and the same number of states.
    # Furthermore the transition probability matrices must be SxS in
    # shape, so we check for that also.
    #
        # If the user has put their transition matrices into a numpy array
        # with dtype of 'object', then it is possible that they have made a
        # mistake and not all of the matrices are of the same shape. So,
        # here we record the number of actions and states that the first
        # matrix in element zero of the object array says it has. After
        # that we check that every other matrix also reports the same
        # number of actions and states, otherwise fail with an error.
        # aP: the number of actions in the transition array. This
        # corresponds to the number of elements in the object array.
        #
        # sP0: the number of states as reported by the number of rows of
        # the transition matrix
        # sP1: the number of states as reported by the number of columns of
        # the transition matrix
        #
        # Now we check to see that every element of the object array holds
        # a matrix of the same shape, otherwise fail.
        #
            # sP0aa and sP1aa represent the number of states in each
            # subsequent element of the object array. If it doesn't match
            # what was found in the first element, then we need to fail
            # telling the user what needs to be fixed.
            #
        # if we are using a normal array for this, then the first
        # dimension should be the number of actions, and the second and 
        # third should be the number of states
        #
    # the first dimension of the transition matrix must report the same
    # number of states as the second dimension. If not then we are not
    # dealing with a square matrix and it is not a valid transition
    # probability. Also, if the number of actions is less than one, or the
    # number of states is less than one, then it also is not a valid
    # transition probability.
    #
    # now we check that each transition matrix is square-stochastic. For
    # object arrays this is the matrix held in each element, but for
    # normal arrays this is a matrix formed by taking a slice of the array
    #
        # if the reward array has an object dtype, then we check that
        # each element contains a matrix of the same shape as we did 
        # above with the transition array.
        #
        # This indicates that the reward matrices are constructed per 
        # transition, so that the first dimension is the actions and
        # the second two dimensions are the states.
        #
        # then the reward matrix is per state, so the first dimension is 
        # the states and the second dimension is the actions.
        #
        # this is added just so that the next check doesn't error out
        # saying that sR1 doesn't exist
        #
    # the number of actions must be more than zero, the number of states
    # must also be more than 0, and the states must agree
    #
    # now we check to see that what the transition array is reporting and
    # what the reward array is reporting agree as to the number of actions
    # and states. If not then fail explaining the situation
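    #
    # For example (a sketch, not part of the toolbox itself): an object array
    # holding one SxS matrix per action could be built like this and then
    # passed to check() together with a compatible reward array:
    #
    #   >>> from numpy import empty
    #   >>> from scipy.sparse import identity
    #   >>> P = empty(2, dtype=object)
    #   >>> P[0] = identity(3, format='csr')
    #   >>> P[1] = identity(3, format='csr')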

def checkSquareStochastic(Z):
    """Check if Z is a square stochastic matrix.
    
    Let S = number of states.
    
    Parameters
    ----------
    Z : matrix
        This should be a two dimensional array with a shape of (S, S). It can
        possibly be sparse.
    
    Notes
    -----
    Returns None if no error has been detected, else it raises an error.
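
    Examples
    --------
    A minimal sketch: the identity matrix is square, non-negative and its
    rows sum to one, so the check passes silently:

    >>> import mdp
    >>> from numpy import eye
    >>> mdp.checkSquareStochastic(eye(3))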
    
    """
    # try to get the shape of the matrix
    try:
        s1, s2 = Z.shape
    except AttributeError:
        raise TypeError("Matrix should be a numpy type.")
    except ValueError:
        raise ValueError(mdperr["mat_square"])
    # check that the matrix is square, and that each row sums to one
    if s1 != s2:
        raise ValueError(mdperr["mat_square"])
    elif (absolute(Z.sum(axis=1) - ones(s2))).max() > 10e-12:
        raise ValueError(mdperr["mat_stoch"])
    # make sure that there are no values less than zero
    try:
        if (Z < 0).any():
            raise ValueError(mdperr["mat_nonneg"])
    except AttributeError:
        try:
            if (Z.data < 0).any():
                raise ValueError(mdperr["mat_nonneg"])
        except AttributeError:
            raise TypeError("Matrix should be a numpy type.")
    except:
        raise
    
    return None

def exampleForest(S=3, r1=4, r2=2, p=0.1, is_sparse=False):
    """Generate an MDP example based on a simple forest management scenario.
    
    This function is used to generate a transition probability
    (``A`` × ``S`` × ``S``) array ``P`` and a reward (``S`` × ``A``) matrix
    ``R`` that model the following problem. A forest is managed by two actions:
    'Wait' and 'Cut'. An action is decided each year with first the objective
    to maintain an old forest for wildlife and second to make money selling cut
    wood. Each year there is a probability ``p`` that a fire burns the forest.
    
    Here is how the problem is modelled.
    Let {1, 2 . . . ``S`` } be the states of the forest, with ``S`` being the 
    oldest. Let 'Wait' be action 1 and 'Cut' action 2.
    After a fire, the forest is in the youngest state, that is state 1.
    The transition matrix P of the problem can then be defined as follows::
        
                   | p 1-p 0.......0  |
                   | .  0 1-p 0....0  |
        P[1,:,:] = | .  .  0  .       |
                   | .  .        .    |
                   | .  .         1-p |
                   | p  0  0....0 1-p |
        
                   | 1 0..........0 |
                   | . .          . |
        P[2,:,:] = | . .          . |
                   | . .          . |
                   | . .          . |
                   | 1 0..........0 |
    
    The reward matrix R is defined as follows::
        
                 |  0  |
                 |  .  |
        R[:,1] = |  .  |
                 |  .  |
                 |  0  |
                 |  r1 |
        
                 |  0  |
                 |  1  |
        R[:,2] = |  .  |
                 |  .  |
                 |  1  |
                 |  r2 |
    
    Parameters
    ----------
    S : int, optional
        The number of states, which should be an integer greater than 0. By
        default it is 3.
    r1 : float, optional
        The reward when the forest is in its oldest state and action 'Wait' is
        performed. By default it is 4.
    r2 : float, optional
        The reward when the forest is in its oldest state and action 'Cut' is
        performed. By default it is 2.
    p : float, optional
        The probability of wild fire occurrence, in the range ]0, 1[. By
        default it is 0.1.
    is_sparse : logical, optional
        If True, the transition probability matrices are returned as a list
        of sparse (CSR) matrices rather than a dense numpy array. By default
        it is False.
    
    Returns
    -------
    out : tuple
        ``out[0]`` contains the transition probability matrix P with a shape
        of (A, S, S). ``out[1]`` contains the reward matrix R with a shape of
        (S, A).
    
    Examples
    --------
    >>> import mdp
    >>> P, R = mdp.exampleForest()
    >>> P
    array([[[ 0.1,  0.9,  0. ],
            [ 0.1,  0. ,  0.9],
            [ 0.1,  0. ,  0.9]],
    <BLANKLINE>
           [[ 1. ,  0. ,  0. ],
            [ 1. ,  0. ,  0. ],
            [ 1. ,  0. ,  0. ]]])
    >>> R
    array([[ 0.,  0.],
           [ 0.,  1.],
           [ 4.,  2.]])
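    
    A sketch of the sparse variant; with ``is_sparse=True`` the transition
    matrices are returned as a list of two sparse matrices, while the reward
    matrix stays dense:
    
    >>> Psp, Rsp = mdp.exampleForest(S=4, is_sparse=True)
    >>> len(Psp)
    2
    >>> Rsp.shape
    (4, 2)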
    
    """
    if S <= 1:
        raise ValueError(mdperr["S_gt_1"])
    if (r1 <= 0) or (r2 <= 0):
        raise ValueError(mdperr["R_gt_0"])
    if (p < 0) or (p > 1):
        raise ValueError(mdperr["prob_in01"])
    # Definition of Transition matrix P(:,:,1) associated to action Wait
    # (action 1) and P(:,:,2) associated to action Cut (action 2)
    #             | p 1-p 0.......0  |                  | 1 0..........0 |
    #             | .  0 1-p 0....0  |                  | . .          . |
    #  P(:,:,1) = | .  .  0  .       |  and P(:,:,2) =  | . .          . |
    #             | .  .        .    |                  | . .          . |
    #             | .  .         1-p |                  | . .          . |
    #             | p  0  0....0 1-p |                  | 1 0..........0 |
    if is_sparse:
        P = []
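        # Transition matrix for action 'Wait': with probability p a fire
        # returns the forest to the youngest state, otherwise the forest
        # ages by one state (the oldest state persists until a fire).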
        rows = range(S) * 2
        cols = [0] * S + range(1, S) + [S - 1]
        vals = [p] * S + [1 - p] * S
        P.append(coo_matrix((vals, (rows, cols)), shape=(S,S)).tocsr())
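        # Transition matrix for action 'Cut': the forest always returns to
        # the youngest state.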
        rows = range(S)
        cols = [0] * S
        vals = [1] * S
        P.append(coo_matrix((vals, (rows, cols)), shape=(S,S)).tocsr())
    else:
        P = zeros((2, S, S))
        P[0, :, :] = (1 - p) * diag(ones(S - 1), 1)
        P[0, :, 0] = p
        P[0, S - 1, S - 1] = (1 - p)
        P[1, :, :] = zeros((S, S))
        P[1, :, 0] = 1
    # Definition of Reward matrix R1 associated to action Wait and 
    # R2 associated to action Cut
    #           | 0  |                   | 0  |
    #           | .  |                   | 1  |
    #  R(:,1) = | .  |  and     R(:,2) = | .  |	
    #           | .  |                   | .  |
    #           | 0  |                   | 1  |                   
    #           | r1 |                   | r2 |
    R = zeros((S, 2))
    R[S - 1, 0] = r1
    R[:, 1] = ones(S)
    R[0, 1] = 0
    R[S - 1, 1] = r2
    # we want to return the generated transition and reward matrices
    return (P, R)

def exampleRand(S, A, is_sparse=False, mask=None):
    """Generate a random Markov Decision Process.
    
    Parameters
    ----------
    S : int
        number of states (> 0)
    A : int
        number of actions (> 0)
    is_sparse : logical, optional
        false to have matrices in dense format, true to have sparse
        matrices (default false).
    mask : array or None, optional
        matrix with 0 and 1 (0 indicates a place for a zero
        probability), (SxS) (default, random)
    
    Returns
    -------
    out : tuple
        ``out[0]`` contains the transition probability matrix P with a shape
        of (A, S, S). ``out[1]`` contains the reward matrix R with a shape of
        (S, A).

    Examples
    --------
    >>> import mdp
    >>> P, R = mdp.exampleRand(5, 3)
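    
    A mask can also be supplied to control which transitions are allowed to
    be non-zero (a sketch using an all-ones mask, which permits every
    transition):
    
    >>> from numpy import ones
    >>> P, R = mdp.exampleRand(3, 2, mask=ones((2, 3, 3)))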
    
    """
    # making sure the states and actions are more than one
    if (S < 1) or (A < 1):
        raise ValueError(mdperr["SA_gt_1"])
    # if the user hasn't specified a mask, then we will make a random one now
    if mask is None:
        mask = rand(A, S, S)
        if is_sparse:
            # create a mask that has roughly two thirds of the cells set to 0
            #for a in range(A):
            mask[mask <= 2/3.0] = 0
            mask[mask > 2/3.0] = 1
        else:
            r = random()
            mask[mask < r] = 0
            mask[mask >= r] = 1
    else:
        # the mask needs to be SxS or AxSxS
        try:
            if mask.shape not in ((S, S), (A, S, S)):
                raise ValueError(mdperr["mask_SbyS"])
        except AttributeError:
            raise TypeError(mdperr["mask_numpy"])
    # generate the transition and reward matrices based on S, A and mask
    if is_sparse:
        # definition of transition matrix : square stochastic matrix
        P = [None] * A
        # definition of reward matrix (values between -1 and +1)
        R = [None] * A
        for a in xrange(A):
            if mask.shape == (A, S, S):
                m = mask[a] # mask[a, :, :]
            else:
                m = mask
            # it may be more efficient to implement this by constructing lists
            # of rows, columns and values then creating a coo_matrix, but this
            # works for now
            PP = dok_matrix((S, S))
            RR = dok_matrix((S, S))
            for s in xrange(S):
                n = int(m[s].sum()) # m[s, :]
                if n == 0:
                    PP[s, randint(0, S)] = 1
                else:
                    rows = s * ones(n, dtype=int)
                    cols = where(m[s])[0] # m[s, :]
                    vals = rand(n)
                    vals = vals / vals.sum()
                    reward = 2*rand(n) - ones(n)
                    # I want to do this: PP[rows, cols] = vals, but it doesn't
                    # seem to work, as val[-1] is stored as the value for each
                    # row, column pair. Therefore the loop is needed.
                    for x in xrange(n):
                        PP[rows[x], cols[x]] = vals[x]
                        RR[rows[x], cols[x]] = reward[x]
            # PP.tocsr() takes the same amount of time as PP.tocoo().tocsr()
            # so constructing PP and RR as coo_matrix in the first place is
            # probably "better"
            P[a] = PP.tocsr()
            R[a] = RR.tocsr()
    else:
        # definition of transition matrix : square stochastic matrix
        P = zeros((A, S, S))
        # definition of reward matrix (values between -1 and +1)
        R = zeros((A, S, S))
        for a in range(A):
            if mask.ndim == 3:
                P[a, :, :] = mask[a] * rand(S, S)
                for s in range(S):
                    if mask[a, s, :].sum() == 0:
                        P[a, s, randint(0, S)] = 1
                    P[a, s, :] = P[a, s, :] / P[a, s, :].sum()
                R[a, :, :] = (mask[a, :, :] * (2*rand(S, S) -
                              ones((S, S), dtype=int)))
            else:
                P[a, :, :] = mask * rand(S, S)
                for s in range(S):
                    if mask[s, :].sum() == 0:
                        P[a, s, randint(0, S)] = 1
                    P[a, s, :] = P[a, s, :] / P[a, s, :].sum()
                R[a, :, :] = mask * (2*rand(S, S) - ones((S, S), dtype=int))
    # we want to return the generated transition and reward matrices
    return (P, R)

def getSpan(W):
    """Return the span of W
    
    sp(W) = max W(s) - min W(s)
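    
    Examples
    --------
    A small sketch of the calculation:
    
    >>> from numpy import array
    >>> float(getSpan(array([1.0, 3.0, 2.0])))
    2.0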
    
    """
    return (W.max() - W.min())


class MDP(object):
    
    """A Markov Decision Problem.
    
    Parameters
    ----------
    transitions : array
            transition probability matrices
    reward : array
            reward matrices
    discount : float or None
            discount factor
    epsilon : float or None
            stopping criteria
    max_iter : int or None
            maximum number of iterations
    
    Attributes
    ----------
    P : array
        Transition probability matrices
    R : array
        Reward matrices
    V : list
        Value function
    discount : float
        The discount rate on future rewards
    max_iter : int
        The maximum number of iterations the algorithm is allowed to run for
    policy : list
        The optimal policy found by the algorithm
    time : float
        The time taken by the most recent call to iterate()
    verbose : logical
        Whether verbose output is displayed or not
    
    Methods
    -------
    iterate
        To be implemented in child classes, raises exception
    setSilent
        Turn the verbosity off
    setVerbose
        Turn the verbosity on
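    
    Examples
    --------
    A short usage sketch. The base class only checks and stores the problem
    description; the solving itself is done by the subclasses, which override
    ``iterate``:
    
    >>> import mdp
    >>> P, R = mdp.exampleForest()
    >>> m = mdp.MDP(P, R, 0.96, 0.01, 1000)
    >>> m.verbose
    False
    >>> m.setVerbose()
    >>> m.verbose
    True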
    
    """
    
    def __init__(self, transitions, reward, discount, epsilon, max_iter):
        """Initialise an MDP based on the input parameters."""
        
        # if the discount is None then the algorithm is assumed to not use it
        # in its computations
        if type(discount) in (int, float):
            if (discount <= 0) or (discount > 1):
                raise ValueError(mdperr["discount_rng"])
            else:
                if discount == 1:
                    print("PyMDPtoolbox WARNING: check conditions of "
                          "convergence. With no discount, convergence is not "
                          "always assumed.")
                self.discount = discount
        elif discount is not None:
            raise ValueError("PyMDPtoolbox: the discount must be a positive "
                             "real number less than or equal to one.")
        
        # if the max_iter is None then the algorithm is assumed to not use it
        # in its computations
        if type(max_iter) in (int, float):
            if max_iter <= 0:
                raise ValueError(mdperr["maxi_min"])
            else:
                self.max_iter = max_iter
        elif max_iter is not None:
            raise ValueError("PyMDPtoolbox: max_iter must be a positive real "
                             "number greater than zero.")
        
        if type(epsilon) in (int, float):
            if epsilon <= 0:
                raise ValueError("PyMDPtoolbox: epsilon must be greater than "
                                 "0.")
        elif epsilon is not None:
            raise ValueError("PyMDPtoolbox: epsilon must be a positive real "
                             "number greater than zero.")
        
        # we run a check on P and R to make sure they are describing an MDP. If
        # an exception isn't raised then they are assumed to be correct.
        check(transitions, reward)
        
        # computePR will assign the variables self.S, self.A, self.P and self.R
        self._computePR(transitions, reward)
        
        # the verbosity is by default turned off
        self.verbose = False
        # Initially the time taken to perform the computations is set to None
        self.time = None
        # set the initial iteration count to zero
        self.iter = 0
        
        # V should be stored as a vector ie shape of (S,) or (1, S)
        self.V = None
        # policy can also be stored as a vector
        self.policy = None
    
    def _bellmanOperator(self, V=None):
        """Apply the Bellman operator on the value function.
        
        Updates the value function and the Vprev-improving policy.
        
        Returns
        -------
        (policy, value) : tuple of new policy and its value
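        
        Notes
        -----
        As a sketch of the computation: for each action ``aa`` the method
        forms ``Q[aa] = R[aa] + discount * P[aa].dot(V)`` and then returns,
        for every state, the action that maximises ``Q`` together with the
        corresponding maximal value.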
        
        """
        if V is None:
            # this V should be a reference to the data rather than a copy
            V = self.V
        else:
            try:
                if V.shape not in ((self.S,), (1, self.S)):
                    raise ValueError("bellman: V is not the right shape.")
            except AttributeError:
                raise TypeError("bellman: V must be a numpy array or matrix.")
        
        Q = empty((self.A, self.S))
        for aa in range(self.A):
            Q[aa] = self.R[aa] + self.discount * self.P[aa].dot(V)
        
        # Which way is better?
        # 1. Return, (policy, value)
        return (Q.argmax(axis=0), Q.max(axis=0))
        # 2. update self.policy and self.V directly
        # self.V = Q.max(axis=1)
        # self.policy = Q.argmax(axis=1)
    
    def _computePR(self, P, R):
        """Compute the reward for the system in one state choosing an action.
        
        Arguments
        ---------
        Let S = number of states, A = number of actions
            P(SxSxA)  = transition matrix 
                P could be an array with 3 dimensions or  a cell array (1xA), 
                each cell containing a matrix (SxS) possibly sparse
            R(SxSxA) or (SxA) = reward matrix
                R could be an array with 3 dimensions (SxSxA) or  a cell array 
                (1xA), each cell containing a sparse matrix (SxS) or a 2D 
                array(SxA) possibly sparse  
        Evaluation
        ----------
            PR(SxA)   = reward matrix
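        
        Notes
        -----
        As a sketch: when R has three dimensions the expected reward for
        taking action a in state s is computed as
        PR(s, a) = sum over s' of P(a)[s, s'] * R(a)[s, s'],
        while a two dimensional (SxA) reward matrix is used as is.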
        
        """
        # We assume that P and R define an MDP, i.e. the assumption is that
        # check(P, R) has already been run and doesn't fail.
        #
        # Set self.P as a tuple of length A, with each element storing an S×S
        # matrix.
        self.A = len(P)
        try:
            if P.ndim == 3:
                self.S = P.shape[1]
            else:
                self.S = P[0].shape[0]
        except AttributeError:
            self.S = P[0].shape[0]
        except:
            raise
        # convert Ps to matrices
        self.P = []
        for aa in xrange(self.A):
            self.P.append(P[aa])
        self.P = tuple(self.P)
        # Set self.R as a tuple of length A, with each element storing a 1×S
        # vector.
        try:
            if R.ndim == 2:
                self.R = []
                for aa in xrange(self.A):
                    self.R.append(array(R[:, aa]).reshape(self.S))
            else:
                raise AttributeError
        except AttributeError:
            self.R = []
            for aa in xrange(self.A):
                try:
                    self.R.append(P[aa].multiply(R[aa]).sum(1).reshape(self.S))
                except AttributeError:
                    self.R.append(multiply(P[aa],R[aa]).sum(1).reshape(self.S))
                except:
                    raise
        except:
            raise
        self.R = tuple(self.R)
    
    def iterate(self):
        """Raise error because child classes should implement this function."""
        raise NotImplementedError("You should create an iterate() method.")
    
    def setSilent(self):
        """Set the MDP algorithm to silent mode."""
        self.verbose = False
    
    def setVerbose(self):
        """Set the MDP algorithm to verbose mode."""
        self.verbose = True

class FiniteHorizon(MDP):
    
    """An MDP solved using the finite-horizon backwards induction algorithm.
    
    Let S = number of states, A = number of actions
    
    Parameters
    ----------
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or a cell array (1xA),
             each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount factor, in ]0, 1]
    N        = number of periods, greater than 0
    h(S)     = terminal reward, optional (default [0; 0; ... 0] )
    
    Attributes
    ----------
    
    Methods
    -------
    V(S,N+1)     = optimal value function
                 V(:,n) = optimal value function at stage n
                        with stage in 1, ..., N
                        V(:,N+1) = value function for terminal stage 
    policy(S,N)  = optimal policy
                 policy(:,n) = optimal policy at stage n
                        with stage in 1, ...,N
                        policy(:,N) = policy for stage N
    cpu_time = used CPU time
  
    Notes
    -----
    In verbose mode, displays the current stage and policy transpose.
    
    Examples
    --------
    >>> import mdp
    >>> P, R = mdp.exampleForest()
    >>> fh = mdp.FiniteHorizon(P, R, 0.9, 3)
    >>> fh.V
    array([[ 2.6973,  0.81  ,  0.    ,  0.    ],
           [ 5.9373,  3.24  ,  1.    ,  0.    ],
           [ 9.9373,  7.24  ,  4.    ,  0.    ]])
    >>> fh.policy
    array([[0, 0, 0],
           [0, 0, 1],
           [0, 0, 0]])
    """

    def __init__(self, transitions, reward, discount, N, h=None):
        """Initialise a finite horizon MDP."""
        if N < 1:
            raise ValueError('PyMDPtoolbox: N must be greater than 0')
        else:
            self.N = N
        # Initialise the base class
        MDP.__init__(self, transitions, reward, discount, None, None)
        # remove the iteration counter, it is not meaningful for backwards
        # induction
        del self.iter
        # There are value vectors for each time step up to the horizon
        self.V = zeros((self.S, N + 1))
        # There are policy vectors for each time step before the horizon, when
        # we reach the horizon we don't need to make decisions anymore.
        self.policy = empty((self.S, N), dtype=int)
        # Set the reward for the final transition to h, if specified.
        if h is not None:
            self.V[:, N] = h
        
    def iterate(self):
        """Run the finite horizon algorithm."""
        self.time = time()
        
        for n in range(self.N):
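            # Backwards induction (a sketch of the recursion): the stage
            # N-n-1 value function is the Bellman backup of the stage N-n
            # value function, and the maximising actions give the stage
            # N-n-1 policy.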
            W, X = self._bellmanOperator(self.V[:, self.N - n])
            self.V[:, self.N - n - 1] = X
            self.policy[:, self.N - n - 1] = W
            if self.verbose:
                print("stage: %s ... policy transpose : %s" % (
                    self.N - n, self.policy[:, self.N - n - 1].tolist()))
        
        self.time = time() - self.time
        # After this we could create a tuple of tuples for the values and 
        # policies.
        #V = []
        #p = []
        #for n in xrange(self.N):
        #    V.append()
        #    p.append()
        #V.append()
        #self.V = tuple(V)
        #self.policy = tuple(p)

class LP(MDP):
    
    """A discounted MDP solved using linear programming.

    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or a cell array (1xA),
             each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount rate, in ]0; 1[
    
    Evaluation
    ----------
    V(S)   = optimal values
    policy(S) = optimal policy
    cpu_time = used CPU time
    
    Notes    
    -----
    In verbose mode, displays the current stage and policy transpose.
    
    Examples
    --------
    
    """

    def __init__(self, transitions, reward, discount):
        """Initialise a linear programming MDP."""
        
        try:
            from cvxopt import matrix, solvers
            self._linprog = solvers.lp
            self._cvxmat = matrix
        except ImportError:
            raise ImportError("The python module cvxopt is required to use "
                              "linear programming functionality.")
        
        from scipy.sparse import eye as speye
        self._speye = speye
        
        MDP.__init__(self, transitions, reward, discount, None, None)
        
        # this doesn't do what I want it to do c.f. issue #3
        if not self.verbose:
            solvers.options['show_progress'] = False
    
    def iterate(self):
        """Run the linear programming algorithm."""
        self.time = time()
        # The objective is to resolve : min V / V >= PR + discount*P*V
        # The function linprog of the optimisation Toolbox of Mathworks
        # resolves :
        # min f'* x / M * x <= b
        # So the objective could be expressed as :
        # min V / (discount*P-I) * V <= - PR
        # To avoid loop on states, the matrix M is structured following actions
        # M(A*S,S)
        f = self._cvxmat(ones((self.S, 1)))
        # self.R is stored by _computePR as a tuple of A reward vectors of
        # length S, so stack it into an (A*S, 1) column ordered action by
        # action to match the block rows of M constructed below.
        h = self._cvxmat(array(self.R).reshape(self.S * self.A, 1), tc='d')
        M = zeros((self.A * self.S, self.S))
        for aa in range(self.A):
            pos = (aa + 1) * self.S
            M[(pos - self.S):pos, :] = (
                self.discount * self.P[aa] - self._speye(self.S, self.S))
        M = self._cvxmat(M)
        # Using the glpk option will make this behave more like Octave
        # (Octave uses glpk) and perhaps Matlab. If solver=None (ie using the 
        # default cvxopt solver) then V agrees with the Octave equivalent
        # only to 10e-8 places.
        self.V = array(self._linprog(f, M, -h, solver='glpk')['x'])
        
        self.policy, self.V =  self._bellmanOperator()
        
        self.time = time() - self.time
        
        # store value and policy as tuples
        self.V = tuple(self.V.tolist())
        self.policy = tuple(self.policy.tolist())

class PolicyIteration(MDP):
    
    """A discounted MDP solved using the policy iteration algorithm.
    
    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or a cell array (1xA),
             each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount rate, in ]0, 1[
    policy0(S) = starting policy, optional 
    max_iter = maximum number of iterations to be done, greater than 0,
             optional (default 1000)
    eval_type = type of function used to evaluate policy: 
             0 for mdp_eval_policy_matrix, else mdp_eval_policy_iterative
             optional (default 0)
             
    Evaluation
    ----------
    V(S)   = value function 
    policy(S) = optimal policy
    iter     = number of done iterations
    cpu_time = used CPU time
    
    Notes
    -----
    In verbose mode, at each iteration, displays the number 
    of different actions between policy n-1 and n
    
    Examples
    --------
    >>> import mdp
    >>> P, R = mdp.exampleRand(5, 3)
    >>> pi = mdp.PolicyIteration(P, R, 0.9)
    >>> pi.iterate()
    
    """
    
    def __init__(self, transitions, reward, discount, policy0=None,
                 max_iter=1000, eval_type=0):
        """Initialise a policy iteration MDP."""
        
        MDP.__init__(self, transitions, reward, discount, None, max_iter)
        
        if policy0 is None:
            # initialise the policy to the one which maximises the expected
            # immediate reward
            self.V = zeros(self.S)
            self.policy, null = self._bellmanOperator()
            del null
        else:
            policy0 = array(policy0)
            
            if not policy0.shape in ((self.S, ), (self.S, 1), (1, self.S)):
                raise ValueError("PyMDPtoolbox: policy0 must be a vector "
                                 "with length S.")
            
            policy0 = policy0.reshape(self.S)
            
            if (mod(policy0, 1).any() or (policy0 < 0).any() or
                    (policy0 >= self.S).any()):
                raise ValueError("PyMDPtoolbox: policy0 must be a vector of "
                                 "integers between 0 and S-1.")
            else:
                self.policy = policy0
        
        # set or reset the initial values to zero
        self.V = zeros(self.S)
        
        if eval_type in (0, "matrix"):
            from numpy.linalg import solve