# -*- coding: utf-8 -*-
"""Markov Decision Process (MDP) Toolbox
=====================================

The MDP toolbox provides classes and functions for the resolution of
discrete-time Markov Decision Processes.

Available classes
-----------------
MDP
    Base Markov decision process class
FiniteHorizon
    Backwards induction finite horizon MDP
LP
    Linear programming MDP
PolicyIteration
    Policy iteration MDP
PolicyIterationModified
    Modified policy iteration MDP
QLearning
    Q-learning MDP
RelativeValueIteration
    Relative value iteration MDP
ValueIteration
    Value iteration MDP
ValueIterationGS
    Gauss-Seidel value iteration MDP

Available functions
-------------------
check
    Check that an MDP is properly defined
checkSquareStochastic
    Check that a matrix is square and stochastic
exampleForest
    A simple forest management example
exampleRand
    A random example

How to use the documentation
----------------------------
Documentation is available both as docstrings provided with the code and
in html or pdf format from 
`The MDP toolbox homepage <http://www.somewhere.com>`_. The docstring
examples assume that the `mdp` module has been imported like so::

  >>> import mdptoolbox.mdp as mdp

Code snippets are indicated by three greater-than signs::

  >>> x = 17
  >>> x = x + 1
  >>> x
  18

The documentation can be displayed with
`IPython <http://ipython.scipy.org>`_. For example, to view the docstring of
the ValueIteration class use ``mdp.ValueIteration?<ENTER>``, and to view its
source code use ``mdp.ValueIteration??<ENTER>``.

Acknowledgments
---------------
This module is modified from the MDPtoolbox (c) 2009 INRA available at 
http://www.inra.fr/mia/T/MDPtoolbox/.

"""

# Copyright (c) 2011-2013 Steven A. W. Cordwell
# Copyright (c) 2009 INRA
# 
# All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 
#   * Redistributions of source code must retain the above copyright notice,
#     this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions and the following disclaimer in the documentation
#     and/or other materials provided with the distribution.
#   * Neither the name of the <ORGANIZATION> nor the names of its contributors
#     may be used to endorse or promote products derived from this software
#     without specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

from math import ceil, log, sqrt
from time import time

from numpy import absolute, array, empty, mean, mod, multiply
from numpy import ndarray, ones, zeros
from numpy.random import randint, random
from scipy.sparse import csr_matrix as sparse

from utils import check, getSpan

class MDP(object):
    
    """A Markov Decision Problem.
    
    Parameters
    ----------
    transitions : array
        transition probability matrices
    reward : array
        reward matrices
    discount : float or None
        discount factor
    epsilon : float or None
        stopping criteria
    max_iter : int or None
        maximum number of iterations
    
    Attributes
    ----------
    P : array
        Transition probability matrices
    R : array
        Reward matrices
    V : list
        Value function
    discount : float
        Discount rate used in the computations
    max_iter : int
        Maximum number of iterations allowed
    policy : list
        Optimal policy
    time : float
        CPU time taken to run an algorithm
    verbose : logical
        Whether verbose output is displayed
    
    Methods
    -------
    _iterate
        To be implemented in child classes; raises a NotImplementedError
    setSilent
        Turn the verbosity off
    setVerbose
        Turn the verbosity on
    
    """
    
    def __init__(self, transitions, reward, discount, epsilon, max_iter):
        # Initialise an MDP based on the input parameters.
        
        # if the discount is None then the algorithm is assumed to not use it
        # in its computations
        if type(discount) in (int, float):
            if (discount <= 0) or (discount > 1):
                raise ValueError("Discount rate must be in ]0; 1]")
            else:
                if discount == 1:
                    print("PyMDPtoolbox WARNING: check conditions of "
                          "convergence. With no discount, convergence is not "
                          "always assumed.")
                self.discount = discount
        elif discount is not None:
            raise ValueError("PyMDPtoolbox: the discount must be a positive "
                             "real number less than or equal to one.")
        # if the max_iter is None then the algorithm is assumed to not use it
        # in its computations
        if type(max_iter) in (int, float):
            if max_iter <= 0:
                raise ValueError("The maximum number of iterations must be "
                                 "greater than 0")
            else:
                self.max_iter = max_iter
        elif max_iter is not None:
            raise ValueError("PyMDPtoolbox: max_iter must be a positive real "
                             "number greater than zero.")
        # check that epsilon is something sane
        if type(epsilon) in (int, float):
            if epsilon <= 0:
                raise ValueError("PyMDPtoolbox: epsilon must be greater than "
                                 "0.")
        elif epsilon is not None:
            raise ValueError("PyMDPtoolbox: epsilon must be a positive real "
                             "number greater than zero.")
        # we run a check on P and R to make sure they are describing an MDP. If
        # an exception isn't raised then they are assumed to be correct.
        check(transitions, reward)
        # computePR will assign the variables self.S, self.A, self.P and self.R
        self._computePR(transitions, reward)
        # the verbosity is by default turned off
        self.verbose = False
        # Initially the time taken to perform the computations is set to None
        self.time = None
        # set the initial iteration count to zero
        self.iter = 0
        # V should be stored as a vector ie shape of (S,) or (1, S)
        self.V = None
        # policy can also be stored as a vector
        self.policy = None
    
    def _bellmanOperator(self, V=None):
        # Apply the Bellman operator on the value function.
        # 
        # Updates the value function and the Vprev-improving policy.
        # 
        # Returns: (policy, value), tuple of new policy and its value
        #
        # If V hasn't been sent into the method, then we assume to be working
        # on the object's V attribute
        if V is None:
            # this V should be a reference to the data rather than a copy
            V = self.V
        else:
            # make sure the user supplied V is of the right shape
            try:
                if V.shape not in ((self.S,), (1, self.S)):
                    raise ValueError("bellman: V is not the right shape.")
            except AttributeError:
                raise TypeError("bellman: V must be a numpy array or matrix.")
        # Looping through each action, the Q-value matrix is calculated.
        # P and V can be any object that supports indexing, so it is important
        # that you know they define a valid MDP before calling the
        # _bellmanOperator method. Otherwise the results will be meaningless.
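        # For each action aa the update below computes, in vector form,
        #   Q[aa, s] = R[aa][s] + discount * sum_{s'} P[aa][s, s'] * V[s']
        # i.e. the expected immediate reward of action aa in state s plus the
        # discounted expected value of the successor state under V.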
        Q = empty((self.A, self.S))
        for aa in range(self.A):
            Q[aa] = self.R[aa] + self.discount * self.P[aa].dot(V)
        # Get the policy and value, for now it is being returned but...
        # Which way is better?
        # 1. Return, (policy, value)
        return (Q.argmax(axis=0), Q.max(axis=0))
        # 2. update self.policy and self.V directly
        # self.V = Q.max(axis=0)
        # self.policy = Q.argmax(axis=0)
    
    def _computePR(self, P, R):
        # Compute the reward for the system in one state choosing an action.
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        #    P(SxSxA)  = transition matrix 
        #        P could be an array with 3 dimensions or  a cell array (1xA), 
        #        each cell containing a matrix (SxS) possibly sparse
        #    R(SxSxA) or (SxA) = reward matrix
        #        R could be an array with 3 dimensions (SxSxA) or  a cell array 
        #        (1xA), each cell containing a sparse matrix (SxS) or a 2D 
        #        array(SxA) possibly sparse  
        # Evaluation
        # ----------
        #    PR(SxA)   = reward matrix
        #
        # We assume that P and R define an MDP, i.e. the assumption is that
        # check(P, R) has already been run and doesn't fail.
        #
        # Set self.P as a tuple of length A, with each element storing an S×S
        # matrix.
        self.A = len(P)
        try:
            if P.ndim == 3:
                self.S = P.shape[1]
            else:
                self.S = P[0].shape[0]
        except AttributeError:
            self.S = P[0].shape[0]
        except:
            raise
        # convert Ps to matrices
        self.P = []
        for aa in xrange(self.A):
            self.P.append(P[aa])
        self.P = tuple(self.P)
        # Set self.R as a tuple of length A, with each element storing a 1×S
        # vector.
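        # When R is given per transition as (S, S, A), the expected reward of
        # taking action aa in state s is computed below as
        #   R[aa][s] = sum_{s'} P[aa][s, s'] * R[aa][s, s']
        # which is what the element-wise multiply followed by sum(1) does.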
        try:
            if R.ndim == 2:
                self.R = []
                for aa in xrange(self.A):
                    self.R.append(array(R[:, aa]).reshape(self.S))
            else:
                raise AttributeError
        except AttributeError:
            self.R = []
            for aa in xrange(self.A):
                try:
                    self.R.append(P[aa].multiply(R[aa]).sum(1).reshape(self.S))
                except AttributeError:
                    self.R.append(multiply(P[aa],R[aa]).sum(1).reshape(self.S))
                except:
                    raise
        except:
            raise
        self.R = tuple(self.R)
    
    def _iterate(self):
        # Raise error because child classes should implement this function.
        raise NotImplementedError("You should create an _iterate() method.")
    
    def setSilent(self):
        """Set the MDP algorithm to silent mode."""
        self.verbose = False
    
    def setVerbose(self):
        """Set the MDP algorithm to verbose mode."""
        self.verbose = True

class FiniteHorizon(MDP):
    
    """An MDP solved using the finite-horizon backwards induction algorithm.
    
    Let S = number of states, A = number of actions
    
    Parameters
    ----------
    P(SxSxA) = transition matrix 
319
320
             P could be an array with 3 dimensions ora cell array (1xA),
             each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount factor, in ]0, 1]
    N        = number of periods, greater than 0
    h(S)     = terminal reward, optional (default [0; 0; ... 0] )
    
    Attributes
    ----------
    
    V(S,N+1)     = optimal value function
                 V(:,n) = optimal value function at stage n
                        with stage in 1, ..., N
                        V(:,N+1) = value function for terminal stage 
    policy(S,N)  = optimal policy
                 policy(:,n) = optimal policy at stage n
                        with stage in 1, ...,N
                        policy(:,N) = policy for stage N
    cpu_time = used CPU time
  
    Notes
    -----
    In verbose mode, displays the current stage and policy transpose.
    
    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.forest()
    >>> fh = mdptoolbox.mdp.FiniteHorizon(P, R, 0.9, 3)
    >>> fh.V
    array([[ 2.6973,  0.81  ,  0.    ,  0.    ],
           [ 5.9373,  3.24  ,  1.    ,  0.    ],
           [ 9.9373,  7.24  ,  4.    ,  0.    ]])
    >>> fh.policy
    array([[0, 0, 0],
           [0, 0, 1],
           [0, 0, 0]])
    
    """

    def __init__(self, transitions, reward, discount, N, h=None):
        # Initialise a finite horizon MDP.
        if N < 1:
            raise ValueError('PyMDPtoolbox: N must be greater than 0')
        else:
            self.N = N
        # Initialise the base class
        MDP.__init__(self, transitions, reward, discount, None, None)
        # remove the iteration counter, it is not meaningful for backwards
        # induction
        del self.iter
        # There are value vectors for each time step up to the horizon
        self.V = zeros((self.S, N + 1))
        # There are policy vectors for each time step before the horizon, when
        # we reach the horizon we don't need to make decisions anymore.
        self.policy = empty((self.S, N), dtype=int)
        # Set the reward for the final transition to h, if specified.
        if h is not None:
            self.V[:, N] = h
        # Call the iteration method
        self._iterate()
        
    def _iterate(self):
        # Run the finite horizon algorithm.
        self.time = time()
        # loop through each time period
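        # Backwards induction: starting from the terminal values V[:, N], each
        # pass applies the Bellman operator to obtain
        #   V[:, n] = max_a (R_a + discount * P_a * V[:, n + 1])
        # along with the maximising (optimal) policy for stage n.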
        for n in range(self.N):
            W, X = self._bellmanOperator(self.V[:, self.N - n])
            self.V[:, self.N - n - 1] = X
            self.policy[:, self.N - n - 1] = W
            if self.verbose:
                print("stage: %s ... policy transpose : %s" % (
                    self.N - n, self.policy[:, self.N - n - 1].tolist()))
        # update time spent running
        self.time = time() - self.time
        # After this we could create a tuple of tuples for the values and 
        # policies.
        #V = []
        #p = []
        #for n in xrange(self.N):
        #    V.append()
        #    p.append()
        #V.append()
        #self.V = tuple(V)
        #self.policy = tuple(p)

class LP(MDP):
    
    """A discounted MDP solved using linear programming.

    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or a cell array (1xA),
             each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount rate, in ]0; 1[
    
    Evaluation
    ----------
    V(S)   = optimal values
    policy(S) = optimal policy
    cpu_time = used CPU time
    
    Notes    
    -----
    In verbose mode, displays the current stage and policy transpose.
    
    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.forest()
    >>> lp = mdptoolbox.mdp.LP(P, R, 0.9)
    
    """

    def __init__(self, transitions, reward, discount):
        # Initialise a linear programming MDP.
        # import some functions from cvxopt and set them as object methods
        try:
            from cvxopt import matrix, solvers
            self._linprog = solvers.lp
            self._cvxmat = matrix
        except ImportError:
            raise ImportError("The python module cvxopt is required to use "
                              "linear programming functionality.")
        # we also need diagonal matrices, and using a sparse one may be more
        # memory efficient
        from scipy.sparse import eye as speye
        self._speye = speye
        # initialise the MDP. epsilon and max_iter are not needed
        MDP.__init__(self, transitions, reward, discount, None, None)
        # Set the cvxopt solver to be quiet by default, but ...
        # this doesn't do what I want it to do c.f. issue #3
        if not self.verbose:
            solvers.options['show_progress'] = False
        # Call the iteration method
        self._iterate()
    
    def _iterate(self):
        # Run the linear programming algorithm.
        self.time = time()
        # The objective is to resolve : min V / V >= PR + discount*P*V
        # The function linprog of the optimisation Toolbox of Mathworks
        # resolves :
        # min f'* x / M * x <= b
        # So the objective could be expressed as :
        # min V / (discount*P-I) * V <= - PR
        # To avoid loop on states, the matrix M is structured following actions
        # M(A*S,S)
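        # Any V satisfying V >= R_a + discount * P_a * V for every action a is
        # an upper bound on the optimal value function, so minimising sum(V)
        # subject to the stacked constraints recovers the optimal values. The
        # reward vector h below is stacked in the same action-block order as M.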
        f = self._cvxmat(ones((self.S, 1)))
        h = self._cvxmat(array(self.R).reshape(self.S * self.A, 1), tc='d')
        M = zeros((self.A * self.S, self.S))
        for aa in range(self.A):
            pos = (aa + 1) * self.S
            M[(pos - self.S):pos, :] = (
                self.discount * self.P[aa] - self._speye(self.S, self.S))
        M = self._cvxmat(M)
        # Using the glpk option will make this behave more like Octave
        # (Octave uses glpk) and perhaps Matlab. If solver=None (ie using the 
        # default cvxopt solver) then V agrees with the Octave equivalent
        # only to 10e-8 places. This assumes glpk is installed of course.
        self.V = array(self._linprog(f, M, -h, solver='glpk')['x'])
        # apply the Bellman operator
        self.policy, self.V =  self._bellmanOperator()
        # update the time spent solving
        self.time = time() - self.time
        # store value and policy as tuples
        self.V = tuple(self.V.tolist())
        self.policy = tuple(self.policy.tolist())

class PolicyIteration(MDP):
    
    """A discounted MDP solved using the policy iteration algorithm.
    
    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or a cell array (1xA),
             each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount rate, in ]0, 1[
    policy0(S) = starting policy, optional 
    max_iter = maximum number of iterations to be done, greater than 0,
             optional (default 1000)
    eval_type = type of function used to evaluate policy: 
             0 for mdp_eval_policy_matrix, else mdp_eval_policy_iterative
             optional (default 0)
             
    Evaluation
    ----------
    V(S)   = value function 
    policy(S) = optimal policy
    iter     = number of iterations performed
    cpu_time = used CPU time
    
    Notes
    -----
    In verbose mode, at each iteration, displays the number
    of different actions between policy n-1 and n
    
    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.rand()
    >>> pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.9)
    
    >>> P, R = mdptoolbox.example.forest()
    >>> pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.9)
    >>> pi.V
    (26.244000000000018, 29.48400000000002, 33.484000000000016)
    >>> pi.policy
    (0, 0, 0)
    """
    
    def __init__(self, transitions, reward, discount, policy0=None,
                 max_iter=1000, eval_type=0):
        # Initialise a policy iteration MDP.
        #
        # Set up the MDP, but there is no need to worry about epsilon values
        MDP.__init__(self, transitions, reward, discount, None, max_iter)
        # Check if the user has supplied an initial policy. If not make one.
        if policy0 is None:
            # Initialise the policy to the one which maximises the expected
            # immediate reward
            null = zeros(self.S)
            self.policy, null = self._bellmanOperator(null)
            del null
        else:
            # Use the policy that the user supplied
            # Make sure it is a numpy array
            policy0 = array(policy0)
            # Make sure the policy is the right size and shape
            if policy0.shape not in ((self.S, ), (self.S, 1), (1, self.S)):
                raise ValueError("PyMDPtoolbox: policy0 must be a vector with "
                                 "length S.")
            # reshape the policy to be a vector
            policy0 = policy0.reshape(self.S)
            # The policy can only contain integers between 0 and S-1
            if (mod(policy0, 1).any() or (policy0 < 0).any() or
                    (policy0 >= self.S).any()):
                raise ValueError("PyMDPtoolbox: policy0 must be a vector of "
                                 "integers between 0 and S-1.")
            else:
                self.policy = policy0
        # set the initial values to zero
        self.V = zeros(self.S)
        # Do some setup depending on the evaluation type
        if eval_type in (0, "matrix"):
            from numpy.linalg import solve
            from scipy.sparse import eye
            self._speye = eye
            self._lin_eq = solve
            self.eval_type = "matrix"
        elif eval_type in (1, "iterative"):
            self.eval_type = "iterative"
        else:
            raise ValueError("PyMDPtoolbox: eval_type should be 0 for matrix "
                             "evaluation or 1 for iterative evaluation. "
                             "The strings 'matrix' and 'iterative' can also "
                             "be used.")
        # Call the iteration method
        self._iterate()
    
    def _computePpolicyPRpolicy(self):
        # Compute the transition matrix and the reward matrix for a policy.
        #
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        # P(SxSxA)  = transition matrix 
        #     P could be an array with 3 dimensions or a cell array (1xA),
        #     each cell containing a matrix (SxS) possibly sparse
        # R(SxSxA) or (SxA) = reward matrix
        #     R could be an array with 3 dimensions (SxSxA) or 
        #     a cell array (1xA), each cell containing a sparse matrix (SxS) or
        #     a 2D array(SxA) possibly sparse  
        # policy(S) = a policy
        #
        # Evaluation
        # ----------
        # Ppolicy(SxS)  = transition matrix for policy
        # PRpolicy(S)   = reward matrix for policy
        #
        Ppolicy = empty((self.S, self.S))
        Rpolicy = zeros(self.S)
        for aa in range(self.A): # avoid looping over S
            # the rows that use action a.
            ind = (self.policy == aa).nonzero()[0]
            # if no rows use action a, then no need to assign this
            if ind.size > 0:
                Ppolicy[ind, :] = self.P[aa][ind, :]
                #PR = self._computePR() # an apparently unneeded line, and
                # perhaps harmful in this implementation c.f.
                # mdp_computePpolicyPRpolicy.m
                Rpolicy[ind] = self.R[aa][ind]
        # self.R cannot be sparse with the code in its current condition, but
        # it should be possible in the future. Also, if R is so big that it's
        # a good idea to use a sparse matrix for it, then converting PRpolicy
        # from a dense to sparse matrix doesn't seem very memory efficient
        if type(self.R) is sparse:
            Rpolicy = sparse(Rpolicy)
        #self.Ppolicy = Ppolicy
        #self.Rpolicy = Rpolicy
        return (Ppolicy, Rpolicy)
    
    def _evalPolicyIterative(self, V0=0, epsilon=0.0001, max_iter=10000):
        # Evaluate a policy using iteration.
        #
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        # P(SxSxA)  = transition matrix 
        #    P could be an array with 3 dimensions or 
        #    a cell array (1xA), each cell containing a matrix possibly sparse
        # R(SxSxA) or (SxA) = reward matrix
        #    R could be an array with 3 dimensions (SxSxA) or 
        #    a cell array (1xA), each cell containing a sparse matrix (SxS) or
        #    a 2D array(SxA) possibly sparse  
        # discount  = discount rate in ]0; 1[
        # policy(S) = a policy
        # V0(S)     = starting value function, optional (default : zeros(S,1))
        # epsilon   = epsilon-optimal policy search, greater than 0,
        #    optional (default : 0.0001)
        # max_iter  = maximum number of iterations to be done, greater than 0,
        #    optional (default : 10000)
        #    
        # Evaluation
        # ----------
        # Vpolicy(S) = value function, associated to a specific policy
        #
        # Notes
        # -----
        # In verbose mode, at each iteration, displays the condition which
        # stopped iterations: epsilon-optimum value function found or maximum
        # number of iterations reached.
        #
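        # The loop below applies the fixed-policy Bellman recursion
        #   V_{k+1} = R_policy + discount * P_policy * V_k
        # which converges linearly, at rate discount, to the value function of
        # the current policy.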
        if (type(V0) in (int, float)) and (V0 == 0):
            policy_V = zeros(self.S)
        else:
            if isinstance(V0, ndarray) and V0.shape in ((self.S,), (self.S, 1)):
                policy_V = array(V0).reshape(self.S)
            else:
                raise ValueError("PyMDPtoolbox: V0 vector/array type not "
                                 "supported. Use an ndarray or matrix column "
                                 "vector of length S.")
        
        policy_P, policy_R = self._computePpolicyPRpolicy()
        
        if self.verbose:
            print('  Iteration    V_variation')
        
        itr = 0
        done = False
        while not done:
            itr += 1
            
            Vprev = policy_V
            policy_V = policy_R + self.discount * policy_P.dot(Vprev)
            
            variation = absolute(policy_V - Vprev).max()
            if self.verbose:
                print('      %s         %s' % (itr, variation))
            
            # ensure |Vn - Vpolicy| < epsilon
            if variation < ((1 - self.discount) / self.discount) * epsilon:
                done = True
                if self.verbose:
                    print("PyMDPtoolbox: iterations stopped, epsilon-optimal "
                          "value function.")
            elif itr == max_iter:
                done = True
                if self.verbose:
                    print("PyMDPtoolbox: iterations stopped by maximum number "
                          "of iteration condition.")
        
        self.V = policy_V
    
    def _evalPolicyMatrix(self):
        # Evaluate the value function of the policy using linear equations.
        #
        # Arguments 
        # ---------
        # Let S = number of states, A = number of actions
        # P(SxSxA) = transition matrix 
        #      P could be an array with 3 dimensions or a cell array (1xA),
        #      each cell containing a matrix (SxS) possibly sparse
        # R(SxSxA) or (SxA) = reward matrix
        #      R could be an array with 3 dimensions (SxSxA) or 
        #      a cell array (1xA), each cell containing a sparse matrix (SxS) or
        #      a 2D array(SxA) possibly sparse  
        # discount = discount rate in ]0; 1[
        # policy(S) = a policy
        #
        # Evaluation
        # ----------
        # Vpolicy(S) = value function of the policy
        #
        Ppolicy, Rpolicy = self._computePpolicyPRpolicy()
        # V = PR + gPV  => (I-gP)V = PR  => V = inv(I-gP)* PR
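        # Solving this SxS linear system gives the exact value of the current
        # policy in a single step, at a cost of roughly O(S^3) for dense
        # matrices.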
        self.V = self._lin_eq(
            (self._speye(self.S, self.S) - self.discount * Ppolicy), Rpolicy)
    
    def _iterate(self):
        # Run the policy iteration algorithm.
        # If verbose then print a header
        if self.verbose:
            print('  Iteration  Number_of_different_actions')
        # Set up the while stopping condition and the current time
        done = False
        self.time = time()
        # loop until a stopping condition is reached
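        # Each pass alternates two steps: evaluate the current policy (which
        # updates self.V) and then improve it greedily with the Bellman
        # operator; the loop ends once the greedy policy stops changing.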
        while not done:
            self.iter += 1
            # these _evalPolicy* functions will update the class's value
            # attribute
            if self.eval_type == "matrix":
                self._evalPolicyMatrix()
            elif self.eval_type == "iterative":
                self._evalPolicyIterative()
            # This should update the class's policy attribute but leave the
            # value alone
            policy_next, null = self._bellmanOperator()
            del null
            # calculate in how many places the old policy disagrees with
            # the new policy
            n_different = (policy_next != self.policy).sum()
            # if verbose then continue printing a table
            if self.verbose:
                print('       %s                 %s' % (self.iter,
                                                        n_different))
            # Once the policy is unchanging or the maximum number of
            # iterations has been reached then stop
            if n_different == 0:
                done = True
                if self.verbose:
                    print("PyMDPtoolbox: iterations stopped, unchanging "
                          "policy found.")
            elif (self.iter == self.max_iter):
                done = True 
                if self.verbose:
                    print("PyMDPtoolbox: iterations stopped by maximum number "
                          "of iteration condition.")
            else:
                self.policy = policy_next
        # update the time to return the computation time
        self.time = time() - self.time
        # store value and policy as tuples
        self.V = tuple(self.V.tolist())
        self.policy = tuple(self.policy.tolist())

class PolicyIterationModified(PolicyIteration):
    
    """A discounted MDP solved using a modified policy iteration algorithm.
    
    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or a cell array (1xA),
             each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount rate, in ]0, 1[
    epsilon  = epsilon-optimal policy search, greater than 0,
             optional (default 0.01)
    max_iter = maximum number of iterations to be done, greater than 0,
             optional (default 10)
    
    Data Attributes
    ---------------
    V(S)   = value function 
    policy(S) = optimal policy
    iter     = number of iterations performed
    cpu_time = used CPU time
    
    Notes
    -----
    In verbose mode, at each iteration, displays the variation of V
    between iterations n-1 and n
    
    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.forest()
    >>> pim = mdptoolbox.mdp.PolicyIterationModified(P, R, 0.9)
    >>> pim.policy
    FIXME
    >>> pim.V
    FIXME
    
    """
    
    def __init__(self, transitions, reward, discount, epsilon=0.01,
                 max_iter=10):
        # Initialise a (modified) policy iteration MDP.
        
        # Maybe it's better not to subclass from PolicyIteration, because the
        # initialisations of the two are quite different. E.g. there is policy0
        # being calculated here which doesn't need to be. The only thing that
        # is needed from the PolicyIteration class is the _evalPolicyIterative
        # function. Perhaps there is a better way to do it?
        PolicyIteration.__init__(self, transitions, reward, discount, None,
                                 max_iter, 1)
        
        # PolicyIteration doesn't pass epsilon to MDP.__init__() so we will
        # check it here
        if type(epsilon) in (int, float):
            if epsilon <= 0:
                raise ValueError("PyMDPtoolbox: epsilon must be greater than "
                                 "0.")
        else:
            raise ValueError("PyMDPtoolbox: epsilon must be a positive real "
                             "number greater than zero.")
        
        # computation of threshold of variation for V for an epsilon-optimal
        # policy
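        # Stopping once the span of the value update falls below
        # epsilon * (1 - discount) / discount guarantees that the greedy
        # policy obtained is epsilon-optimal (the standard span-based bound
        # for discounted MDPs).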
        if self.discount != 1:
            self.thresh = epsilon * (1 - self.discount) / self.discount
        else:
            self.thresh = epsilon
        
        self.epsilon = epsilon
        
        if discount == 1:
            self.V = zeros(self.S)
        else:
            # min(min()) is not right
            self.V = 1 / (1 - discount) * array(self.R).min() * ones(self.S)
        
        # Call the iteration method
        self._iterate()
    
    def _iterate(self):
        # Run the modified policy iteration algorithm.
        
        if self.verbose:
            print('  Iteration  V_variation')
        
        self.time = time()
        
        done = False
        while not done:
            self.iter += 1
            
            self.policy, Vnext = self._bellmanOperator()
            #[Ppolicy, PRpolicy] = mdp_computePpolicyPRpolicy(P, PR, policy);
            
            variation = getSpan(Vnext - self.V)
            if self.verbose:
                print("      %s         %s" % (self.iter, variation))
            
            self.V = Vnext
            if variation < self.thresh:
                done = True
            else:
                is_verbose = False
                if self.verbose:
                    self.setSilent()
                    is_verbose = True
                
                self._evalPolicyIterative(self.V, self.epsilon, self.max_iter)
                
                if is_verbose:
                    self.setVerbose()
        
        self.time = time() - self.time
        
        # store value and policy as tuples
        self.V = tuple(self.V.tolist())
        self.policy = tuple(self.policy.tolist())

class QLearning(MDP):
    
    """A discounted MDP solved using the Q learning algorithm.
    
    Let S = number of states, A = number of actions
    
    Parameters
    ----------
    P : transition matrix (SxSxA)
        P could be an array with 3 dimensions or a cell array (1xA), each
        cell containing a sparse matrix (SxS)
    R : reward matrix(SxSxA) or (SxA)
        R could be an array with 3 dimensions (SxSxA) or a cell array
        (1xA), each cell containing a sparse matrix (SxS) or a 2D
        array(SxA) possibly sparse
    discount : discount rate
        in ]0; 1[    
    n_iter : number of iterations to execute (optional).
        Default value = 10000; it must be an integer greater than or equal
        to the default value.
    
    Results
    -------
    Q : learned Q matrix (SxA) 
    
    V : learned value function (S).
    
    policy : learned optimal policy (S).
    
    mean_discrepancy : vector of V discrepancy mean over 100 iterations
        The length of this vector for the default value of N is 100
        (N/100).

    Examples
    --------
    >>> # These examples are reproducible only if random seed is set to 0 in
    >>> # both the random and numpy.random modules.
    >>> import numpy as np
    >>> import mdptoolbox, mdptoolbox.example
    >>> np.random.seed(0)
    >>> P, R = mdptoolbox.example.forest()
    >>> ql = mdptoolbox.mdp.QLearning(P, R, 0.96)
    >>> ql.Q
    array([[ 68.38037354,  43.24888454],
           [ 72.37777922,  42.75549145],
           [ 77.02892702,  64.68712932]])
    >>> ql.V
    (68.38037354422798, 72.37777921607258, 77.02892701616531)
    >>> ql.policy
    (0, 0, 0)
    
    >>> import mdptoolbox
    >>> import numpy as np
    >>> P = np.array([[[0.5, 0.5],[0.8, 0.2]],[[0, 1],[0.1, 0.9]]])
    >>> R = np.array([[5, 10], [-1, 2]])
    >>> np.random.seed(0)
    >>> ql = mdptoolbox.mdp.QLearning(P, R, 0.9)
    >>> ql.Q
    array([[ 39.933691  ,  43.17543338],
           [ 36.94394224,  35.42568056]])
    >>> ql.V
    (43.17543338090149, 36.943942243204454)
    >>> ql.policy
    (1, 0)
    
    """
    
    def __init__(self, transitions, reward, discount, n_iter=10000):
        # Initialise a Q-learning MDP.
        
        # The following check won't be done in MDP()'s initialisation, so let's
        # do it here
        if (n_iter < 10000):
            raise ValueError("PyMDPtoolbox: n_iter should be at least "
                             "10000.")
        
        # We don't want to send this to MDP because _computePR should not be
        # run on it
        # MDP.__init__(self, transitions, reward, discount, None, n_iter)
        check(transitions, reward)
        
        if transitions.dtype == object:
            self.P = transitions
            self.A = self.P.shape[0]
            self.S = self.P[0].shape[0]
        else: # convert to an object array
            self.A = transitions.shape[0]
            self.S = transitions.shape[1]
            self.P = zeros(self.A, dtype=object)
            for aa in range(self.A):
                self.P[aa] = transitions[aa, :, :]
        
        self.R = reward
        
        self.discount = discount
        
        self.max_iter = n_iter
        
        # Initialisations
        self.Q = zeros((self.S, self.A))
        self.mean_discrepancy = []
        
        # Call the iteration method
        self._iterate()
        
    def _iterate(self):
        # Run the Q-learning algorithm.
        discrepancy = []
        
        self.time = time()
        
        # initial state choice
        s = randint(0, self.S)
        
        for n in range(1, self.max_iter + 1):
            
            # Reinitialisation of trajectories every 1