# -*- coding: utf-8 -*-
"""Markov Decision Process (MDP) Toolbox
=====================================

The MDP toolbox provides classes and functions for the resolution of
discrete-time Markov Decision Processes.

Available classes
-----------------
MDP
    Base Markov decision process class
FiniteHorizon
    Backwards induction finite horizon MDP
LP
    Linear programming MDP
PolicyIteration
    Policy iteration MDP
PolicyIterationModified
    Modified policy iteration MDP
QLearning
    Q-learning MDP
RelativeValueIteration
    Relative value iteration MDP
ValueIteration
    Value iteration MDP
ValueIterationGS
    Gauss-Seidel value iteration MDP

Available functions
-------------------
check
    Check that an MDP is properly defined
checkSquareStochastic
    Check that a matrix is square and stochastic
exampleForest
    A simple forest management example
exampleRand
    A random example

How to use the documentation
----------------------------
Documentation is available both as docstrings provided with the code and
in html or pdf format from 
`The MDP toolbox homepage <http://www.somewhere.com>`_. The docstring
examples assume that the `mdp` module has been imported like so::

  >>> import mdptoolbox.mdp as mdp

Code snippets are indicated by three greater-than signs::

  >>> x = 17
  >>> x = x + 1
  >>> x
  18

The documentation can be displayed with
`IPython <http://ipython.scipy.org>`_. For example, to view the docstring of
the ValueIteration class use ``mdp.ValueIteration?<ENTER>``, and to view its
source code use ``mdp.ValueIteration??<ENTER>``.

Acknowledgments
---------------
This module is modified from the MDPtoolbox (c) 2009 INRA available at 
http://www.inra.fr/mia/T/MDPtoolbox/.

"""

# Copyright (c) 2011-2013 Steven A. W. Cordwell
# Copyright (c) 2009 INRA
# 
# All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 
#   * Redistributions of source code must retain the above copyright notice,
#     this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions and the following disclaimer in the documentation
#     and/or other materials provided with the distribution.
#   * Neither the name of the <ORGANIZATION> nor the names of its contributors
#     may be used to endorse or promote products derived from this software
#     without specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

from math import ceil, log, sqrt
from random import random
from time import time

from numpy import absolute, array, empty, mean, mod, multiply
from numpy import ndarray, ones, zeros
from numpy.random import randint
from scipy.sparse import csr_matrix as sparse

from utils import check, getSpan

# These need to be fixed so that we use classes derived from Error.
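# A possible shape for that refactoring (an illustrative sketch only; these
# classes are not defined or used anywhere in the module yet):
#
#     class MDPError(Exception):
#         """Base exception class for PyMDPtoolbox errors."""
#         pass
#
#     class InvalidMDPError(MDPError):
#         """Raised when P and R do not describe a valid MDP."""
#         pass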
mdperr = {
"mat_nonneg" :
    "PyMDPtoolbox: Probabilities must be non-negative.",
"mat_square" :
    "PyMDPtoolbox: The matrix must be square.",
"mat_stoch" :
    "PyMDPtoolbox: Rows of the matrix must sum to one (1).",
"mask_numpy" :
    "PyMDPtoolbox: mask must be a numpy array or matrix; i.e. type(mask) is "
    "ndarray or type(mask) is matrix.", 
"mask_SbyS" : 
    "PyMDPtoolbox: The mask must have shape SxS; i.e. mask.shape = (S, S).",
"obj_shape" :
    "PyMDPtoolbox: Object arrays for transition probabilities and rewards "
    "must have only 1 dimension: the number of actions A. Each element of "
    "the object array contains an SxS ndarray or matrix.",
"obj_square" :
    "PyMDPtoolbox: Each element of an object array for transition "
    "probabilities and rewards must contain an SxS ndarray or matrix; i.e. "
    "P[a].shape = (S, S) or R[a].shape = (S, S).",
"P_type" :
    "PyMDPtoolbox: The transition probabilities must be in a numpy array; "
    "i.e. type(P) is ndarray.",
"P_shape" :
    "PyMDPtoolbox: The transition probability array must have the shape "
    "(A, S, S)  with S : number of states greater than 0 and A : number of "
    "actions greater than 0. i.e. R.shape = (A, S, S)",
"PR_incompat" :
    "PyMDPtoolbox: Incompatibility between P and R dimensions.",
"prob_in01" :
    "PyMDPtoolbox: Probability p must be in [0; 1].",
"R_type" :
    "PyMDPtoolbox: The rewards must be in a numpy array; i.e. type(R) is "
    "ndarray, or numpy matrix; i.e. type(R) is matrix.",
"R_shape" :
    "PyMDPtoolbox: The reward matrix R must be an array of shape (A, S, S) or "
    "(S, A) with S : number of states greater than 0 and A : number of "
    "actions greater than 0. i.e. R.shape = (S, A) or (A, S, S).",
"R_gt_0" :
    "PyMDPtoolbox: The rewards must be greater than 0.",
"S_gt_1" :
    "PyMDPtoolbox: Number of states S must be greater than 1.",
"SA_gt_1" : 
    "PyMDPtoolbox: The number of states S and the number of actions A must be "
    "greater than 1.",
"discount_rng" : 
    "PyMDPtoolbox: Discount rate must be in ]0; 1]",
"maxi_min" :
    "PyMDPtoolbox: The maximum number of iterations must be greater than 0"
}

class MDP(object):
    
    """A Markov Decision Problem.
    
    Parameters
    ----------
    transitions : array
            transition probability matrices
    reward : array
            reward matrices
    discount : float or None
            discount factor
    epsilon : float or None
            stopping criteria
    max_iter : int or None
            maximum number of iterations
    
    Attributes
    ----------
    P : array
        Transition probability matrices
    R : array
        Reward matrices
    V : list
        Value function
    discount : float
        Discount rate
    max_iter : int
        Maximum number of iterations
    policy : list
        Optimal policy
    time : float
        CPU time used to run the MDP algorithm
    verbose : logical
        Whether verbose output is displayed or not
    
    Methods
    -------
    _iterate
        To be implemented in child classes, raises exception
    setSilent
        Turn the verbosity off
    setVerbose
        Turn the verbosity on
    
    """
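    # Illustrative usage sketch (assumes the concrete solver classes defined
    # later in this module, e.g. ValueIteration, and the mdptoolbox.example
    # module used in the docstring examples below):
    #
    #     >>> import mdptoolbox, mdptoolbox.example
    #     >>> P, R = mdptoolbox.example.forest()
    #     >>> vi = mdptoolbox.mdp.ValueIteration(P, R, 0.96)
    #     >>> vi.policy   # the optimal policy found by the solver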
    
    def __init__(self, transitions, reward, discount, epsilon, max_iter):
        """Initialise an MDP based on the input parameters."""
        
        # if the discount is None then the algorithm is assumed to not use it
        # in its computations
        if type(discount) in (int, float):
            if (discount <= 0) or (discount > 1):
                raise ValueError(mdperr["discount_rng"])
            else:
                if discount == 1:
                    print("PyMDPtoolbox WARNING: check conditions of "
                          "convergence. With no discount, convergence is not "
                          "guaranteed.")
                self.discount = discount
        elif discount is not None:
            raise ValueError("PyMDPtoolbox: the discount must be a positive "
                             "real number less than or equal to one.")
        # if the max_iter is None then the algorithm is assumed to not use it
        # in its computations
        if type(max_iter) in (int, float):
            if max_iter <= 0:
                raise ValueError(mdperr["maxi_min"])
            else:
                self.max_iter = max_iter
        elif max_iter is not None:
            raise ValueError("PyMDPtoolbox: max_iter must be a positive real "
                             "number greater than zero.")
        # check that epsilon is something sane
        if type(epsilon) in (int, float):
            if epsilon <= 0:
                raise ValueError("PyMDPtoolbox: epsilon must be greater than "
                                 "0.")
        elif epsilon is not None:
            raise ValueError("PyMDPtoolbox: epsilon must be a positive real "
                             "number greater than zero.")
        # we run a check on P and R to make sure they are describing an MDP. If
        # an exception isn't raised then they are assumed to be correct.
        check(transitions, reward)
        # computePR will assign the variables self.S, self.A, self.P and self.R
        self._computePR(transitions, reward)
        # the verbosity is by default turned off
        self.verbose = False
        # Initially the time taken to perform the computations is set to None
        self.time = None
        # set the initial iteration count to zero
        self.iter = 0
        # V should be stored as a vector ie shape of (S,) or (1, S)
        self.V = None
        # policy can also be stored as a vector
        self.policy = None
    
    def _bellmanOperator(self, V=None):
        # Apply the Bellman operator on the value function.
        # Updates the value function and the Vprev-improving policy.
        # Returns: (policy, value), tuple of new policy and its value
        #
        # If V hasn't been sent into the method, then we assume to be working
        # on the objects V attribute
        if V is None:
            # this V should be a reference to the data rather than a copy
            V = self.V
        else:
            # make sure the user supplied V is of the right shape
            try:
                if V.shape not in ((self.S,), (1, self.S)):
                    raise ValueError("bellman: V is not the right shape.")
            except AttributeError:
                raise TypeError("bellman: V must be a numpy array or matrix.")
        # Loop through each action to calculate the Q-value matrix
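        # i.e. Q[aa, s] = R[aa][s] + discount * sum_s' P[aa][s, s'] * V[s']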
        Q = empty((self.A, self.S))
        for aa in range(self.A):
            Q[aa] = self.R[aa] + self.discount * self.P[aa].dot(V)
        # Get the policy and value, for now it is being returned but...
        # Which way is better?
        # 1. Return, (policy, value)
        return (Q.argmax(axis=0), Q.max(axis=0))
        # 2. update self.policy and self.V directly
        # self.V = Q.max(axis=1)
        # self.policy = Q.argmax(axis=1)
    
    def _computePR(self, P, R):
        # Compute the reward for the system in one state choosing an action.
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        #    P(SxSxA)  = transition matrix 
        #        P could be an array with 3 dimensions or  a cell array (1xA), 
        #        each cell containing a matrix (SxS) possibly sparse
        #    R(SxSxA) or (SxA) = reward matrix
        #        R could be an array with 3 dimensions (SxSxA) or  a cell array 
        #        (1xA), each cell containing a sparse matrix (SxS) or a 2D 
        #        array(SxA) possibly sparse  
        # Evaluation
        # ----------
        #    PR(SxA)   = reward matrix
        #
        # We assume that P and R define an MDP, i.e. the assumption is that
        # check(P, R) has already been run and doesn't fail.
        #
        # Set self.P as a tuple of length A, with each element storing an S×S
        # matrix.
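        # For example, with A = 2 actions and S = 3 states, self.P ends up as
        # a tuple of two (3, 3) matrices and self.R (set below) as a tuple of
        # two reward vectors of length 3.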
        self.A = len(P)
        try:
            if P.ndim == 3:
                self.S = P.shape[1]
            else:
                self.S = P[0].shape[0]
        except AttributeError:
            self.S = P[0].shape[0]
        except:
            raise
        # convert Ps to matrices
        self.P = []
        for aa in xrange(self.A):
            self.P.append(P[aa])
        self.P = tuple(self.P)
        # Set self.R as a tuple of length A, with each element storing a 1×S
        # vector.
        try:
            if R.ndim == 2:
                self.R = []
                for aa in xrange(self.A):
                    self.R.append(array(R[:, aa]).reshape(self.S))
            else:
                raise AttributeError
        except AttributeError:
            self.R = []
            for aa in xrange(self.A):
                try:
                    self.R.append(P[aa].multiply(R[aa]).sum(1).reshape(self.S))
                except AttributeError:
                    self.R.append(multiply(P[aa],R[aa]).sum(1).reshape(self.S))
                except:
                    raise
        except:
            raise
        self.R = tuple(self.R)
    
    def _iterate(self):
        # Raise error because child classes should implement this function.
        raise NotImplementedError("You should create an _iterate() method.")
    
    def setSilent(self):
        """Set the MDP algorithm to silent mode."""
        self.verbose = False
    
    def setVerbose(self):
        """Set the MDP algorithm to verbose mode."""
        self.verbose = True

class FiniteHorizon(MDP):
    
    """An MDP solved using the finite-horizon backwards induction algorithm.
    
    Let S = number of states, A = number of actions
    
    Parameters
    ----------
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or a cell array (1xA),
             each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount factor, in ]0, 1]
    N        = number of periods, greater than 0
    h(S)     = terminal reward, optional (default [0; 0; ... 0] )
    
    Attributes
    ----------
    
    Methods
    -------
    V(S,N+1)     = optimal value function
                 V(:,n) = optimal value function at stage n
                        with stage in 1, ..., N
                        V(:,N+1) = value function for terminal stage 
    policy(S,N)  = optimal policy
                 policy(:,n) = optimal policy at stage n
                        with stage in 1, ...,N
                        policy(:,N) = policy for stage N
    cpu_time = used CPU time
  
    Notes
    -----
    In verbose mode, displays the current stage and policy transpose.
    
    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.forest()
    >>> fh = mdptoolbox.mdp.FiniteHorizon(P, R, 0.9, 3)
    >>> fh.V
    array([[ 2.6973,  0.81  ,  0.    ,  0.    ],
           [ 5.9373,  3.24  ,  1.    ,  0.    ],
           [ 9.9373,  7.24  ,  4.    ,  0.    ]])
    >>> fh.policy
    array([[0, 0, 0],
           [0, 0, 1],
           [0, 0, 0]])
    
    """

    def __init__(self, transitions, reward, discount, N, h=None):
        """Initialise a finite horizon MDP."""
        if N < 1:
            raise ValueError('PyMDPtoolbox: N must be greater than 0')
        else:
            self.N = N
        # Initialise the base class
        MDP.__init__(self, transitions, reward, discount, None, None)
        # remove the iteration counter, it is not meaningful for backwards
        # induction
        del self.iter
        # There are value vectors for each time step up to the horizon
        self.V = zeros((self.S, N + 1))
        # There are policy vectors for each time step before the horizon, when
        # we reach the horizon we don't need to make decisions anymore.
        self.policy = empty((self.S, N), dtype=int)
        # Set the reward for the final transition to h, if specified.
        if h is not None:
            self.V[:, N] = h
        # Call the iteration method
        self._iterate()
        
    def _iterate(self):
        # Run the finite horizon algorithm.
        self.time = time()
        # loop through each time period
        for n in range(self.N):
            W, X = self._bellmanOperator(self.V[:, self.N - n])
            self.V[:, self.N - n - 1] = X
            self.policy[:, self.N - n - 1] = W
            if self.verbose:
                print("stage: %s ... policy transpose : %s" % (
                    self.N - n, self.policy[:, self.N - n - 1].tolist()))
        # update time spent running
        self.time = time() - self.time
        # After this we could create a tuple of tuples for the values and 
        # policies.
        #V = []
        #p = []
        #for n in xrange(self.N):
        #    V.append()
        #    p.append()
        #V.append()
        #self.V = tuple(V)
        #self.policy = tuple(p)

class LP(MDP):
    
    """A discounted MDP solved using linear programming.

    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or a cell array (1xA),
             each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount rate, in ]0; 1[
    h(S)     = terminal reward, optional (default [0; 0; ... 0] )
    
    Evaluation
    ----------
    V(S)   = optimal values
    policy(S) = optimal policy
    cpu_time = used CPU time
    
    Notes
    -----
    In verbose mode, the progress output of the cvxopt linear programming
    solver is not suppressed.
    
    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.forest()
    >>> lp = mdptoolbox.mdp.LP(P, R, 0.9)
    
    """

    def __init__(self, transitions, reward, discount):
        """Initialise a linear programming MDP."""
        # import some functions from cvxopt and set them as object methods
        try:
            from cvxopt import matrix, solvers
            self._linprog = solvers.lp
            self._cvxmat = matrix
        except ImportError:
            raise ImportError("The python module cvxopt is required to use "
                              "linear programming functionality.")
        # we also need diagonal matrices, and using a sparse one may be more
        # memory efficient
        from scipy.sparse import eye as speye
        self._speye = speye
        # initialise the MDP. epsilon and max_iter are not needed
        MDP.__init__(self, transitions, reward, discount, None, None)
        # Set the cvxopt solver to be quiet by default, but ...
        # this doesn't do what I want it to do c.f. issue #3
        if not self.verbose:
            solvers.options['show_progress'] = False
        # Call the iteration method
        self._iterate()
    
    def _iterate(self):
        # Run the linear programming algorithm.
        self.time = time()
        # The objective is to resolve : min V / V >= PR + discount*P*V
        # The function linprog of the optimisation Toolbox of Mathworks
        # resolves :
        # min f'* x / M * x <= b
        # So the objective could be expressed as :
        # min V / (discount*P-I) * V <= - PR
        # To avoid loop on states, the matrix M is structured following actions
        # M(A*S,S)
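        # Each action contributes an (S, S) block, so stacking the A blocks
        # row-wise gives the (A*S, S) constraint matrix M; M * V <= -PR then
        # encodes V >= PR + discount*P*V for every action at once.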
        f = self._cvxmat(ones((self.S, 1)))
        # self.R is a tuple of A length-S reward vectors; stack them by action
        h = self._cvxmat(array(self.R).reshape(self.S * self.A, 1), tc='d')
        M = zeros((self.A * self.S, self.S))
        for aa in range(self.A):
            pos = (aa + 1) * self.S
            M[(pos - self.S):pos, :] = (
                self.discount * self.P[aa] - self._speye(self.S, self.S))
        M = self._cvxmat(M)
        # Using the glpk option will make this behave more like Octave
        # (Octave uses glpk) and perhaps Matlab. If solver=None (ie using the 
        # default cvxopt solver) then V agrees with the Octave equivalent
        # only to 10e-8 places. This assumes glpk is installed of course.
        self.V = array(self._linprog(f, M, -h, solver='glpk')['x'])
        # apply the Bellman operator
        self.policy, self.V =  self._bellmanOperator()
        # update the time spent solving
        self.time = time() - self.time
        # store value and policy as tuples
        self.V = tuple(self.V.tolist())
        self.policy = tuple(self.policy.tolist())

class PolicyIteration(MDP):
    
    """A discounted MDP solved using the policy iteration algorithm.
    
    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or a cell array (1xA),
             each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount rate, in ]0, 1[
    policy0(S) = starting policy, optional 
    max_iter = maximum number of iterations to be done, greater than 0,
             optional (default 1000)
    eval_type = type of function used to evaluate policy: 
             0 for mdp_eval_policy_matrix, else mdp_eval_policy_iterative
             optional (default 0)
             
    Evaluation
    ----------
    V(S)   = value function 
    policy(S) = optimal policy
    iter     = number of done iterations
    cpu_time = used CPU time
    
    Notes
    -----
    In verbose mode, at each iteration, displays the number 
    of different actions between policy n-1 and n
    
    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.rand()
    >>> pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.9)
    
    >>> P, R = mdptoolbox.example.forest()
    >>> pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.9)
    >>> pi.V
    (26.244000000000018, 29.48400000000002, 33.484000000000016)
    >>> pi.policy
    (0, 0, 0)
    """
    
    def __init__(self, transitions, reward, discount, policy0=None,
                 max_iter=1000, eval_type=0):
        # Initialise a policy iteration MDP.
        #
        # Set up the MDP, but don't need to worry about epsilon values
        MDP.__init__(self, transitions, reward, discount, None, max_iter)
        # Check if the user has supplied an initial policy. If not make one.
        if policy0 is None:
            # Initialise the policy to the one which maximises the expected
            # immediate reward
            null = zeros(self.S)
            self.policy, null = self._bellmanOperator(null)
            del null
        else:
            # Use the policy that the user supplied
            # Make sure it is a numpy array
            policy0 = array(policy0)
            # Make sure the policy is the right size and shape
            if policy0.shape not in ((self.S, ), (self.S, 1), (1, self.S)):
                raise ValueError("PyMDPtoolbox: policy0 must be a vector with "
                                 "length S.")
            # reshape the policy to be a vector
            policy0 = policy0.reshape(self.S)
            # The policy can only contain integers between 0 and S-1
            if (mod(policy0, 1).any() or (policy0 < 0).any() or
                    (policy0 >= self.S).any()):
                raise ValueError("PyMDPtoolbox: policy0 must be a vector of "
                                 "integers between 0 and S-1.")
            else:
                self.policy = policy0
        # set the initial values to zero
        self.V = zeros(self.S)
        # Do some setup depending on the evaluation type
        if eval_type in (0, "matrix"):
            from numpy.linalg import solve
            from scipy.sparse import eye
            self._speye = eye
            self._lin_eq = solve
            self.eval_type = "matrix"
        elif eval_type in (1, "iterative"):
            self.eval_type = "iterative"
        else:
            raise ValueError("PyMDPtoolbox: eval_type should be 0 for matrix "
                             "evaluation or 1 for iterative evaluation. "
                             "The strings 'matrix' and 'iterative' can also "
                             "be used.")
        # Call the iteration method
        self._iterate()
    
    def _computePpolicyPRpolicy(self):
        # Compute the transition matrix and the reward matrix for a policy.
        #
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        # P(SxSxA)  = transition matrix 
        #     P could be an array with 3 dimensions or a cell array (1xA),
        #     each cell containing a matrix (SxS) possibly sparse
        # R(SxSxA) or (SxA) = reward matrix
        #     R could be an array with 3 dimensions (SxSxA) or 
        #     a cell array (1xA), each cell containing a sparse matrix (SxS) or
        #     a 2D array(SxA) possibly sparse  
        # policy(S) = a policy
        #
        # Evaluation
        # ----------
        # Ppolicy(SxS)  = transition matrix for policy
        # PRpolicy(S)   = reward matrix for policy
        #
        Ppolicy = empty((self.S, self.S))
        Rpolicy = zeros(self.S)
        for aa in range(self.A): # avoid looping over S
            # the rows that use action a.
            ind = (self.policy == aa).nonzero()[0]
            # if no rows use action a, then no need to assign this
            if ind.size > 0:
                Ppolicy[ind, :] = self.P[aa][ind, :]
                #PR = self._computePR() # an apparently unneeded line, and
                # perhaps harmful in this implementation c.f.
                # mdp_computePpolicyPRpolicy.m
                Rpolicy[ind] = self.R[aa][ind]
        # self.R cannot be sparse with the code in its current condition, but
        # it should be possible in the future. Also, if R is so big that it's
        # a good idea to use a sparse matrix for it, then converting PRpolicy
        # from a dense to sparse matrix doesn't seem very memory efficient
        if type(self.R) is sparse:
            Rpolicy = sparse(Rpolicy)
        #self.Ppolicy = Ppolicy
        #self.Rpolicy = Rpolicy
        return (Ppolicy, Rpolicy)
    
    def _evalPolicyIterative(self, V0=0, epsilon=0.0001, max_iter=10000):
        # Evaluate a policy using iteration.
        #
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        # P(SxSxA)  = transition matrix 
        #    P could be an array with 3 dimensions or 
        #    a cell array (1xA), each cell containing a matrix possibly sparse
        # R(SxSxA) or (SxA) = reward matrix
        #    R could be an array with 3 dimensions (SxSxA) or 
        #    a cell array (1xA), each cell containing a sparse matrix (SxS) or
        #    a 2D array(SxA) possibly sparse  
        # discount  = discount rate in ]0; 1[
        # policy(S) = a policy
        # V0(S)     = starting value function, optional (default : zeros(S,1))
        # epsilon   = epsilon-optimal policy search, greater than 0,
        #    optional (default : 0.0001)
        # max_iter  = maximum number of iterations to be done, greater than 0,
        #    optional (default : 10000)
        #    
        # Evaluation
        # ----------
        # Vpolicy(S) = value function, associated to a specific policy
        #
        # Notes
        # -----
        # In verbose mode, at each iteration, displays the condition which
        # stopped iterations: epsilon-optimum value function found or maximum
        # number of iterations reached.
        #
        if (type(V0) in (int, float)) and (V0 == 0):
            policy_V = zeros(self.S)
        else:
            if (type(V0) in (ndarray,)) and (V0.shape == (self.S, 1)):
                policy_V = V0
            else:
                raise ValueError("PyMDPtoolbox: V0 vector/array type not "
                                 "supported. Use ndarray or matrix column "
                                 "vector length S.")
        
        policy_P, policy_R = self._computePpolicyPRpolicy()
        
        if self.verbose:
            print('  Iteration    V_variation')
        
        itr = 0
        done = False
        while not done:
            itr += 1
            
            Vprev = policy_V
            policy_V = policy_R + self.discount * policy_P.dot(Vprev)
            
            variation = absolute(policy_V - Vprev).max()
            if self.verbose:
                print('      %s         %s' % (itr, variation))
            
            # ensure |Vn - Vpolicy| < epsilon
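            # (a span smaller than ((1 - discount) / discount) * epsilon in
            # successive iterates implies the current iterate is within
            # epsilon of Vpolicy, by the contraction property)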
            if variation < ((1 - self.discount) / self.discount) * epsilon:
                done = True
                if self.verbose:
                    print("PyMDPtoolbox: iterations stopped, epsilon-optimal "
                          "value function.")
            elif itr == max_iter:
                done = True
                if self.verbose:
                    print("PyMDPtoolbox: iterations stopped by maximum number "
                          "of iteration condition.")
        
        self.V = policy_V
    
    def _evalPolicyMatrix(self):
        # Evaluate the value function of the policy using linear equations.
        #
        # Arguments 
        # ---------
        # Let S = number of states, A = number of actions
        # P(SxSxA) = transition matrix 
        #      P could be an array with 3 dimensions or a cell array (1xA),
        #      each cell containing a matrix (SxS) possibly sparse
        # R(SxSxA) or (SxA) = reward matrix
        #      R could be an array with 3 dimensions (SxSxA) or 
        #      a cell array (1xA), each cell containing a sparse matrix (SxS) or
        #      a 2D array(SxA) possibly sparse  
        # discount = discount rate in ]0; 1[
        # policy(S) = a policy
        #
        # Evaluation
        # ----------
        # Vpolicy(S) = value function of the policy
        #
        Ppolicy, Rpolicy = self._computePpolicyPRpolicy()
        # V = PR + gPV  => (I-gP)V = PR  => V = inv(I-gP)* PR
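        # solve the linear system (I - discount*Ppolicy) * V = Rpolicy
        # directly rather than forming the matrix inverse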
        self.V = self._lin_eq(
            (self._speye(self.S, self.S) - self.discount * Ppolicy), Rpolicy)
    
    def _iterate(self):
        # Run the policy iteration algorithm.
        # If verbose then print a header
        if self.verbose:
            print('  Iteration  Number_of_different_actions')
        # Set up the while stopping condition and the current time
        done = False
        self.time = time()
        # loop until a stopping condition is reached
        while not done:
            self.iter += 1
            # these _evalPolicy* functions will update the class's value
            # attribute
            if self.eval_type == "matrix":
                self._evalPolicyMatrix()
            elif self.eval_type == "iterative":
                self._evalPolicyIterative()
            # This should update the class's policy attribute but leave the
            # value alone
            policy_next, null = self._bellmanOperator()
            del null
            # calculate in how many places the old policy disagrees with
            # the new policy
            n_different = (policy_next != self.policy).sum()
            # if verbose then continue printing a table
            if self.verbose:
                print('       %s                 %s' % (self.iter,
                                                        n_different))
            # Once the policy is unchanging or the maximum number of
            # iterations has been reached then stop
            if n_different == 0:
                done = True
                if self.verbose:
                    print("PyMDPtoolbox: iterations stopped, unchanging "
                          "policy found.")
            elif (self.iter == self.max_iter):
                done = True 
                if self.verbose:
                    print("PyMDPtoolbox: iterations stopped by maximum number "
                          "of iteration condition.")
            else:
                self.policy = policy_next
        # update the time to return the computation time
        self.time = time() - self.time
        # store value and policy as tuples
        self.V = tuple(self.V.tolist())
        self.policy = tuple(self.policy.tolist())

class PolicyIterationModified(PolicyIteration):
    
    """A discounted MDP solved using a modified policy iteration algorithm.
    
    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or a cell array (1xA),
             each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount rate, in ]0, 1[
    policy0(S) = starting policy, optional 
    max_iter = maximum number of iterations to be done, greater than 0,
             optional (default 1000)
    eval_type = type of function used to evaluate policy: 
             0 for mdp_eval_policy_matrix, else mdp_eval_policy_iterative
             optional (default 0)
    
    Data Attributes
    ---------------
    V(S)   = value function 
    policy(S) = optimal policy
    iter     = number of done iterations
    cpu_time = used CPU time
    
    Notes
    -----
    In verbose mode, at each iteration, displays the number 
    of different actions between policy n-1 and n
    
    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.forest()
    >>> pim = mdptoolbox.mdp.PolicyIterationModified(P, R, 0.9)
    >>> pim.policy
    FIXME
    >>> pim.V
    FIXME
    
    """
    
    def __init__(self, transitions, reward, discount, epsilon=0.01,
                 max_iter=10):
        """Initialise a (modified) policy iteration MDP."""
        
        # Maybe it's better not to subclass from PolicyIteration, because the
        # initialisations of the two are quite different. e.g. there is policy0
        # being calculated here which doesn't need to be. The only thing that
        # is needed from the PolicyIteration class is the _evalPolicyIterative
        # function. Perhaps there is a better way to do it?
        PolicyIteration.__init__(self, transitions, reward, discount, None,
                                 max_iter, 1)
        
        # PolicyIteration doesn't pass epsilon to MDP.__init__() so we will
        # check it here
        if type(epsilon) in (int, float):
            if epsilon <= 0:
                raise ValueError("PyMDPtoolbox: epsilon must be greater than "
                                 "0.")
        else:
            raise ValueError("PyMDPtoolbox: epsilon must be a positive real "
                             "number greater than zero.")
        
        # computation of threshold of variation for V for an epsilon-optimal
        # policy
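        # with discount < 1 the span bound epsilon * (1 - discount) / discount
        # guarantees an epsilon-optimal policy; with discount == 1 the raw
        # epsilon is used instead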
        if self.discount != 1:
            self.thresh = epsilon * (1 - self.discount) / self.discount
        else:
            self.thresh = epsilon
        
        self.epsilon = epsilon
        
        if discount == 1:
            self.V = zeros((self.S, 1))
        else:
            # min(min()) is not right; self.R is a tuple of per-action reward
            # vectors, so take the global minimum reward
            self.V = 1 / (1 - discount) * array(self.R).min() * ones((self.S, 1))
        
        # Call the iteration method
        self._iterate()
    
    def _iterate(self):
        """Run the modified policy iteration algorithm."""
        
        if self.verbose:
            print('  Iteration  V_variation')
        
        self.time = time()
        
        done = False
        while not done:
            self.iter += 1
            
            self.policy, Vnext = self._bellmanOperator()
            #[Ppolicy, PRpolicy] = mdp_computePpolicyPRpolicy(P, PR, policy);
            
            variation = getSpan(Vnext - self.V)
            if self.verbose:
                print("      %s         %s" % (self.iter, variation))
            
            self.V = Vnext
            if variation < self.thresh:
                done = True
            else:
                is_verbose = False
                if self.verbose:
                    self.setSilent()
                    is_verbose = True
                
                self._evalPolicyIterative(self.V, self.epsilon, self.max_iter)
                
                if is_verbose:
                    self.setVerbose()
        
        self.time = time() - self.time
        
        # store value and policy as tuples
        self.V = tuple(self.V.tolist())
        self.policy = tuple(self.policy.tolist())

class QLearning(MDP):
    
    """A discounted MDP solved using the Q learning algorithm.
    
    Let S = number of states, A = number of actions
    
    Parameters
    ----------
    P : transition matrix (SxSxA)
        P could be an array with 3 dimensions or a cell array (1xA), each
        cell containing a sparse matrix (SxS)
    R : reward matrix(SxSxA) or (SxA)
        R could be an array with 3 dimensions (SxSxA) or a cell array
        (1xA), each cell containing a sparse matrix (SxS) or a 2D
        array(SxA) possibly sparse
    discount : discount rate
        in ]0; 1[    
    n_iter : number of iterations to execute (optional).
        Default value = 10000; it must be an integer of at least 10000.
    
    Results
    -------
    Q : learned Q matrix (SxA) 
    
    V : learned value function (S).
    
    policy : learned optimal policy (S).
    
    mean_discrepancy : vector of mean V discrepancies over 100-iteration blocks
        For the default value of n_iter the length of this vector is 100
        (n_iter/100).

    Examples
    ---------
    >>> # These examples are reproducible only if random seed is set to 0 in
    >>> # both the random and numpy.random modules.
    >>> import numpy as np
    >>> import random
    >>> import mdptoolbox, mdptoolbox.example
    >>> np.random.seed(0)
    >>> random.seed(0)
    >>> P, R = mdptoolbox.example.forest()
    >>> ql = mdptoolbox.mdp.QLearning(P, R, 0.96)
    >>> ql.Q
    array([[ 68.38037354,  43.24888454],
           [ 72.37777922,  42.75549145],
           [ 77.02892702,  64.68712932]])
    >>> ql.V
    (68.38037354422798, 72.37777921607258, 77.02892701616531)
    >>> ql.policy
    (0, 0, 0)
    
    >>> import mdptoolbox
    >>> import random
    >>> import numpy as np
    >>> P = np.array([[[0.5, 0.5],[0.8, 0.2]],[[0, 1],[0.1, 0.9]]])
    >>> R = np.array([[5, 10], [-1, 2]])
    >>> np.random.seed(0)
    >>> random.seed(0)
    >>> ql = mdptoolbox.mdp.QLearning(P, R, 0.9)
    >>> ql.Q
    array([[ 39.933691  ,  43.17543338],
           [ 36.94394224,  35.42568056]])
    >>> ql.V
    (43.17543338090149, 36.943942243204454)
    >>> ql.policy
    (1, 0)
    
    """
    
    def __init__(self, transitions, reward, discount, n_iter=10000):
        """Initialise a Q-learning MDP."""
        
        # The following check won't be done in MDP()'s initialisation, so let's
        # do it here
        if (n_iter < 10000):
            raise ValueError("PyMDPtoolbox: n_iter should be greater than "
                             "10000.")
        
        # We don't want to send this to MDP because _computePR should not be