# -*- coding: utf-8 -*-
"""Markov Decision Process (MDP) Toolbox
=====================================

The MDP toolbox provides classes and functions for the resolution of
discrete-time Markov Decision Processes.

Available classes
-----------------
MDP
    Base Markov decision process class
FiniteHorizon
    Backwards induction finite horizon MDP
LP
    Linear programming MDP
PolicyIteration
    Policy iteration MDP
PolicyIterationModified
    Modified policy iteration MDP
QLearning
    Q-learning MDP
RelativeValueIteration
    Relative value iteration MDP
ValueIteration
    Value iteration MDP
ValueIterationGS
    Gauss-Seidel value iteration MDP

Available functions
-------------------
check
    Check that an MDP is properly defined
checkSquareStochastic
    Check that a matrix is square and stochastic
exampleForest
    A simple forest management example
exampleRand
    A random example

How to use the documentation
----------------------------
Documentation is available both as docstrings provided with the code and
in HTML or PDF format from
`The MDP toolbox homepage <http://www.somewhere.com>`_. The docstring
examples assume that the `mdp` module has been imported like so::

  >>> import mdptoolbox.mdp as mdp

Code snippets are indicated by three greater-than signs::

  >>> x = 17
  >>> x = x + 1
  >>> x
  18

The documentation can be displayed with
`IPython <http://ipython.scipy.org>`_. For example, to view the docstring of
the ValueIteration class use ``mdp.ValueIteration?<ENTER>``, and to view its
source code use ``mdp.ValueIteration??<ENTER>``.
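
A typical problem-solving session, as a minimal sketch (it assumes the
``mdptoolbox.example`` module described in the class docstrings below is
available; the printed results are omitted)::

  >>> import mdptoolbox, mdptoolbox.example
  >>> P, R = mdptoolbox.example.forest()
  >>> vi = mdptoolbox.mdp.ValueIteration(P, R, 0.96)
  >>> vi.policy                                    # doctest: +SKIP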

Acknowledgments
---------------
This module is modified from the MDPtoolbox (c) 2009 INRA available at 
http://www.inra.fr/mia/T/MDPtoolbox/.

"""

# Copyright (c) 2011-2013 Steven A. W. Cordwell
# Copyright (c) 2009 INRA
# 
# All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 
#   * Redistributions of source code must retain the above copyright notice,
#     this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions and the following disclaimer in the documentation
#     and/or other materials provided with the distribution.
#   * Neither the name of the <ORGANIZATION> nor the names of its contributors
#     may be used to endorse or promote products derived from this software
#     without specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

from math import ceil, log, sqrt
from time import time

from numpy import absolute, array, empty, mean, mod, multiply
from numpy import ndarray, ones, zeros
from numpy.random import randint, random
from scipy.sparse import csr_matrix as sparse

from utils import check, getSpan

class MDP(object):
    
    """A Markov Decision Problem.
    
    Parameters
    ----------
    transitions : array
            transition probability matrices
    reward : array
            reward matrices
    discount : float or None
            discount factor
    epsilon : float or None
            stopping criteria
    max_iter : int or None
            maximum number of iterations
    
    Attributes
    ----------
    P : array
        Transition probability matrices
    R : array
        Reward matrices
    V : list
        Value function
    discount : float
        Discount rate
    max_iter : int
        Maximum number of iterations
    policy : list
        Optimal policy
    time : float
        CPU time used to run the MDP algorithm
    verbose : logical
        Whether verbose output is displayed or not
    
    Methods
    -------
    iterate
        To be implemented in child classes, raises exception
    setSilent
        Turn the verbosity off
    setVerbose
        Turn the verbosity on
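
    ``MDP`` itself only stores ``P``, ``R`` and the solver parameters; the
    solving loop lives in the child classes. A rough, illustrative sketch of
    that pattern (``MySolver`` is a made-up name, not part of the toolbox)::

        class MySolver(MDP):
            def __init__(self, transitions, reward, discount):
                MDP.__init__(self, transitions, reward, discount, None, None)
                self._iterate()
            def _iterate(self):
                # update self.V and self.policy, for example with
                # self.policy, self.V = self._bellmanOperator()
                pass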
    
    """
    
    def __init__(self, transitions, reward, discount, epsilon, max_iter):
        # Initialise an MDP based on the input parameters.
        
        # if the discount is None then the algorithm is assumed to not use it
        # in its computations
        if discount is not None:
            self.discount = float(discount)
            assert 0.0 < self.discount <= 1.0, "Discount rate must be in ]0; 1]"
            if self.discount == 1:
                print("PyMDPtoolbox WARNING: check conditions of convergence. "
                      "With no discount, convergence is not always assumed.")
        # if the max_iter is None then the algorithm is assumed to not use it
        # in its computations
        if max_iter is not None:
            self.max_iter = int(max_iter)
            assert self.max_iter > 0, "The maximum number of iterations " \
                                      "must be greater than 0."
        # check that epsilon is something sane
        if epsilon is not None:
            self.epsilon = float(epsilon)
            assert self.epsilon > 0, "Epsilon must be greater than 0."
        # we run a check on P and R to make sure they are describing an MDP. If
        # an exception isn't raised then they are assumed to be correct.
        check(transitions, reward)
        # computePR will assign the variables self.S, self.A, self.P and self.R
        self._computePR(transitions, reward)
        # the verbosity is by default turned off
        self.verbose = False
        # Initially the time taken to perform the computations is set to None
        self.time = None
        # set the initial iteration count to zero
        self.iter = 0
        # V should be stored as a vector, i.e. shape of (S,) or (1, S)
        self.V = None
        # policy can also be stored as a vector
        self.policy = None
    
    def __repr__(self):
        P_repr = "P: \n"
        R_repr = "R: \n"
        for aa in range(self.A):
            P_repr += repr(self.P[aa]) + "\n"
            R_repr += repr(self.R[aa]) + "\n"
        return P_repr + "\n" + R_repr
    
    def _bellmanOperator(self, V=None):
        # Apply the Bellman operator on the value function.
        # 
        # Updates the value function and the Vprev-improving policy.
        # 
        # Returns: (policy, value), tuple of new policy and its value
        #
        # If V hasn't been sent into the method, then we assume to be working
        # on the object's V attribute
        if V is None:
            # this V should be a reference to the data rather than a copy
            V = self.V
        else:
            # make sure the user supplied V is of the right shape
            try:
                assert V.shape in ((self.S,), (1, self.S)), "V is not the " \
                    "right shape (Bellman operator)."
            except AttributeError:
                raise TypeError("V must be a numpy array or matrix.")
        # Looping through each action, the Q-value matrix is calculated.
        # P and V can be any object that supports indexing, so it is important
        # that you know they define a valid MDP before calling the
        # _bellmanOperator method. Otherwise the results will be meaningless.
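        # The computation below is, for each action aa, the vectorised form of
        #     Q[aa, s] = R[aa][s] + discount * sum_s' P[aa][s, s'] * V[s']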
        Q = empty((self.A, self.S))
        for aa in range(self.A):
            Q[aa] = self.R[aa] + self.discount * self.P[aa].dot(V)
        # Get the policy and value, for now it is being returned but...
        # Which way is better?
        # 1. Return, (policy, value)
        return (Q.argmax(axis=0), Q.max(axis=0))
        # 2. update self.policy and self.V directly
        # self.V = Q.max(axis=0)
        # self.policy = Q.argmax(axis=0)
    
    def _computePR(self, P, R):
        # Compute the reward for the system in one state choosing an action.
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        #    P(SxSxA)  = transition matrix 
        #        P could be an array with 3 dimensions or  a cell array (1xA), 
        #        each cell containing a matrix (SxS) possibly sparse
        #    R(SxSxA) or (SxA) = reward matrix
        #        R could be an array with 3 dimensions (SxSxA) or  a cell array 
        #        (1xA), each cell containing a sparse matrix (SxS) or a 2D 
        #        array(SxA) possibly sparse  
        # Evaluation
        # ----------
        #    PR(SxA)   = reward matrix
        #
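        # In other words, when R is given per transition (SxSxA) the expected
        # immediate reward is PR[s, a] = sum_s' P[a][s, s'] * R[a][s, s'];
        # when R is already (SxA) it is used as-is.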
        # We assume that P and R define an MDP, i.e. the assumption is that
        # check(P, R) has already been run and doesn't fail.
        #
        # Set self.P as a tuple of length A, with each element storing an S×S
        # matrix.
        self.A = len(P)
        try:
            if P.ndim == 3:
                self.S = P.shape[1]
            else:
                self.S = P[0].shape[0]
        except AttributeError:
            self.S = P[0].shape[0]
        # convert P to a tuple of numpy arrays
        self.P = tuple([P[aa] for aa in range(self.A)])
        # Set self.R as a tuple of length A, with each element storing a 1×S
        # vector.
        try:
            if R.ndim == 2:
                self.R = tuple([array(R[:, aa]).reshape(self.S)
                                for aa in range(self.A)])
            else:
                self.R = tuple([multiply(P[aa], R[aa]).sum(1).reshape(self.S)
                                for aa in range(self.A)])
        except AttributeError:
            self.R = tuple([multiply(P[aa], R[aa]).sum(1).reshape(self.S)
                            for aa in range(self.A)])
    
    def _iterate(self):
        # Raise error because child classes should implement this function.
        raise NotImplementedError("You should create an _iterate() method.")
    
    def setSilent(self):
        """Set the MDP algorithm to silent mode."""
        self.verbose = False
    
    def setVerbose(self):
        """Set the MDP algorithm to verbose mode."""
        self.verbose = True

class FiniteHorizon(MDP):
    
    """A MDP solved using the finite-horizon backwards induction algorithm.
    
    Let S = number of states, A = number of actions
    
    Parameters
    ----------
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or a cell array (1xA),
             each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount factor, in ]0, 1]
    N        = number of periods, greater than 0
    h(S)     = terminal reward, optional (default [0; 0; ... 0] )
    
    Attributes
    ----------
    
    Methods
    -------
    V(S,N+1)     = optimal value function
                 V(:,n) = optimal value function at stage n
                        with stage in 1, ..., N
                        V(:,N+1) = value function for terminal stage 
    policy(S,N)  = optimal policy
                 policy(:,n) = optimal policy at stage n
                        with stage in 1, ...,N
                        policy(:,N) = policy for stage N
    cpu_time = used CPU time
  
    Notes
    -----
    In verbose mode, displays the current stage and policy transpose.
    
    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.forest()
    >>> fh = mdptoolbox.mdp.FiniteHorizon(P, R, 0.9, 3)
    >>> fh.V
    array([[ 2.6973,  0.81  ,  0.    ,  0.    ],
           [ 5.9373,  3.24  ,  1.    ,  0.    ],
           [ 9.9373,  7.24  ,  4.    ,  0.    ]])
    >>> fh.policy
    array([[0, 0, 0],
           [0, 0, 1],
           [0, 0, 0]])
    
    """

    def __init__(self, transitions, reward, discount, N, h=None):
        # Initialise a finite horizon MDP.
        self.N = int(N)
        assert self.N > 0, 'PyMDPtoolbox: N must be greater than 0.'
        # Initialise the base class
        MDP.__init__(self, transitions, reward, discount, None, None)
        # remove the iteration counter, it is not meaningful for backwards
        # induction
        del self.iter
        # There are value vectors for each time step up to the horizon
        self.V = zeros((self.S, N + 1))
        # There are policy vectors for each time step before the horizon, when
        # we reach the horizon we don't need to make decisions anymore.
        self.policy = empty((self.S, N), dtype=int)
        # Set the reward for the final transition to h, if specified.
        if h is not None:
            self.V[:, N] = h
        # Call the iteration method
        self._iterate()
        
    def _iterate(self):
        # Run the finite horizon algorithm.
        self.time = time()
        # loop through each time period
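        # (working backwards in time: the value at stage N-n-1 is computed
        # from the value already stored for stage N-n)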
        for n in range(self.N):
            W, X = self._bellmanOperator(self.V[:, self.N - n])
            self.V[:, self.N - n - 1] = X
            self.policy[:, self.N - n - 1] = W
            if self.verbose:
                print("stage: %s ... policy transpose : %s") % (
                    self.N - n, self.policy[:, self.N - n -1].tolist())
        # update time spent running
        self.time = time() - self.time
        # After this we could create a tuple of tuples for the values and 
        # policies.
        #V = []
        #p = []
        #for n in xrange(self.N):
        #    V.append()
        #    p.append()
        #V.append()
        #self.V = tuple(V)
        #self.policy = tuple(p)

class LP(MDP):
    
    """A discounted MDP soloved using linear programming.

    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or a cell array (1xA),
             each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount rate, in ]0; 1[
    
    Evaluation
    ----------
    V(S)   = optimal values
    policy(S) = optimal policy
    cpu_time = used CPU time
    
    Notes    
    -----
    In verbose mode, the cvxopt solver's progress output is displayed.
    
    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.forest()
    >>> lp = mdptoolbox.mdp.LP(P, R, 0.9)
    
    """

    def __init__(self, transitions, reward, discount):
        # Initialise a linear programming MDP.
        # import some functions from cvxopt and set them as object methods
        try:
            from cvxopt import matrix, solvers
            self._linprog = solvers.lp
            self._cvxmat = matrix
        except ImportError:
            raise ImportError("The python module cvxopt is required to use "
                              "linear programming functionality.")
        # we also need diagonal matrices, and using a sparse one may be more
        # memory efficient
        from scipy.sparse import eye as speye
        self._speye = speye
        # initialise the MDP. epsilon and max_iter are not needed
        MDP.__init__(self, transitions, reward, discount, None, None)
        # Set the cvxopt solver to be quiet by default, but ...
        # this doesn't do what I want it to do c.f. issue #3
        if not self.verbose:
            solvers.options['show_progress'] = False
        # Call the iteration method
        self._iterate()
    
    def _iterate(self):
        # Run the linear programming algorithm.
        self.time = time()
        # The objective is to solve: min V / V >= PR + discount*P*V
        # The linprog function of the MATLAB Optimization Toolbox solves:
        # min f'* x / M * x <= b
        # So the objective could be expressed as :
        # min V / (discount*P-I) * V <= - PR
        # To avoid loop on states, the matrix M is structured following actions
        # M(A*S,S)
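        # i.e. rows aa*S:(aa+1)*S of M hold (discount * P[aa] - I), so the
        # single constraint M * V <= -PR stacks V >= PR_aa + discount *
        # P[aa] * V for every action aa.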
        f = self._cvxmat(ones((self.S, 1)))
        h = self._cvxmat(self.R.reshape(self.S * self.A, 1, order="F"), tc='d')
        M = zeros((self.A * self.S, self.S))
        for aa in range(self.A):
            pos = (aa + 1) * self.S
            M[(pos - self.S):pos, :] = (
                self.discount * self.P[aa] - self._speye(self.S, self.S))
        M = self._cvxmat(M)
        # Using the glpk option will make this behave more like Octave
        # (Octave uses glpk) and perhaps Matlab. If solver=None (ie using the 
        # default cvxopt solver) then V agrees with the Octave equivalent
        # only to 10e-8 places. This assumes glpk is installed of course.
        self.V = array(self._linprog(f, M, -h, solver='glpk')['x'])
        # apply the Bellman operator
        self.policy, self.V = self._bellmanOperator()
        # update the time spent solving
        self.time = time() - self.time
        # store value and policy as tuples
        self.V = tuple(self.V.tolist())
        self.policy = tuple(self.policy.tolist())

class PolicyIteration(MDP):
    
    """A discounted MDP solved using the policy iteration algorithm.
    
    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or a cell array (1xA),
             each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount rate, in ]0, 1[
    policy0(S) = starting policy, optional 
    max_iter = maximum number of iterations to be done, greater than 0,
             optional (default 1000)
    eval_type = type of function used to evaluate policy: 
             0 for mdp_eval_policy_matrix, else mdp_eval_policy_iterative
             optional (default 0)
             
    Evaluation
    ----------
    V(S)   = value function 
    policy(S) = optimal policy
    iter     = number of iterations performed
    cpu_time = used CPU time
    
    Notes
    -----
    In verbose mode, at each iteration, displays the number
    of different actions between policy n-1 and n
    
    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.rand()
    >>> pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.9)
    
    >>> P, R = mdptoolbox.example.forest()
    >>> pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.9)
    >>> pi.V
    (26.244000000000018, 29.48400000000002, 33.484000000000016)
    >>> pi.policy
    (0, 0, 0)
    """
    
    def __init__(self, transitions, reward, discount, policy0=None,
                 max_iter=1000, eval_type=0):
        # Initialise a policy iteration MDP.
        #
        # Set up the MDP, but don't need to worry about epsilon values
        MDP.__init__(self, transitions, reward, discount, None, max_iter)
        # Check if the user has supplied an initial policy. If not make one.
        if policy0 is None:
            # Initialise the policy to the one which maximises the expected
            # immediate reward
            null = zeros(self.S)
            self.policy, null = self._bellmanOperator(null)
            del null
        else:
            # Use the policy that the user supplied
            # Make sure it is a numpy array
            policy0 = array(policy0)
            # Make sure the policy is the right size and shape
            if policy0.shape not in ((self.S, ), (self.S, 1), (1, self.S)):
                raise ValueError("PyMDPtolbox: policy0 must a vector with "
                                 "length S.")
            # reshape the policy to be a vector
            policy0 = policy0.reshape(self.S)
            # The policy can only contain integers between 0 and S-1
            if (mod(policy0, 1).any() or (policy0 < 0).any() or
                    (policy0 >= self.S).any()):
                raise ValueError("PyMDPtoolbox: policy0 must be a vector of "
                                 "integers between 1 and S.")
            else:
                self.policy = policy0
        # set the initial values to zero
        self.V = zeros(self.S)
        # Do some setup depending on the evaluation type
        if eval_type in (0, "matrix"):
            from numpy.linalg import solve
            from scipy.sparse import eye
            self._speye = eye
            self._lin_eq = solve
            self.eval_type = "matrix"
        elif eval_type in (1, "iterative"):
            self.eval_type = "iterative"
        else:
            raise ValueError("PyMDPtoolbox: eval_type should be 0 for matrix "
                             "evaluation or 1 for iterative evaluation. "
                             "The strings 'matrix' and 'iterative' can also "
                             "be used.")
        # Call the iteration method
        self._iterate()
    
    def _computePpolicyPRpolicy(self):
        # Compute the transition matrix and the reward matrix for a policy.
        #
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        # P(SxSxA)  = transition matrix 
        #     P could be an array with 3 dimensions or a cell array (1xA),
        #     each cell containing a matrix (SxS) possibly sparse
        # R(SxSxA) or (SxA) = reward matrix
        #     R could be an array with 3 dimensions (SxSxA) or 
        #     a cell array (1xA), each cell containing a sparse matrix (SxS) or
        #     a 2D array(SxA) possibly sparse  
        # policy(S) = a policy
        #
        # Evaluation
        # ----------
        # Ppolicy(SxS)  = transition matrix for policy
        # PRpolicy(S)   = reward matrix for policy
        #
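        # Concretely: Ppolicy[s, :] = P[policy[s]][s, :] and
        # Rpolicy[s] = R[policy[s]][s], assembled action-by-action below
        # rather than state-by-state.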
        Ppolicy = empty((self.S, self.S))
        Rpolicy = zeros(self.S)
        for aa in range(self.A): # avoid looping over S
            # the rows that use action a.
            ind = (self.policy == aa).nonzero()[0]
            # if no rows use action a, then no need to assign this
            if ind.size > 0:
                Ppolicy[ind, :] = self.P[aa][ind, :]
                #PR = self._computePR() # an apparently unneeded line, and
                # perhaps harmful in this implementation c.f.
                # mdp_computePpolicyPRpolicy.m
                Rpolicy[ind] = self.R[aa][ind]
        # self.R cannot be sparse with the code in its current condition, but
        # it should be possible in the future. Also, if R is so big that it's
        # a good idea to use a sparse matrix for it, then converting PRpolicy
        # from a dense to sparse matrix doesn't seem very memory efficient
        if type(self.R) is sparse:
            Rpolicy = sparse(Rpolicy)
        #self.Ppolicy = Ppolicy
        #self.Rpolicy = Rpolicy
        return (Ppolicy, Rpolicy)
    
    def _evalPolicyIterative(self, V0=0, epsilon=0.0001, max_iter=10000):
        # Evaluate a policy using iteration.
        #
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        # P(SxSxA)  = transition matrix 
        #    P could be an array with 3 dimensions or 
        #    a cell array (1xA), each cell containing a matrix possibly sparse
        # R(SxSxA) or (SxA) = reward matrix
        #    R could be an array with 3 dimensions (SxSxA) or 
        #    a cell array (1xA), each cell containing a sparse matrix (SxS) or
        #    a 2D array(SxA) possibly sparse  
        # discount  = discount rate in ]0; 1[
        # policy(S) = a policy
        # V0(S)     = starting value function, optional (default : zeros(S,1))
        # epsilon   = epsilon-optimal policy search, greater than 0,
        #    optional (default : 0.0001)
        # max_iter  = maximum number of iterations to be done, greater than 0,
        #    optional (default : 10000)
        #    
        # Evaluation
        # ----------
        # Vpolicy(S) = value function, associated to a specific policy
        #
        # Notes
        # -----
        # In verbose mode, at each iteration, displays the condition which
        # stopped iterations: epsilon-optimum value function found or maximum
        # number of iterations reached.
        #
        if (type(V0) in (int, float)) and (V0 == 0):
            policy_V = zeros(self.S)
        else:
            if isinstance(V0, ndarray) and V0.shape in ((self.S,), (self.S, 1)):
                policy_V = V0.reshape(self.S)
            else:
                raise ValueError("PyMDPtoolbox: V0 vector/array type not "
                                 "supported. Use ndarray of matrix column "
                                 "vector length S.")
        
        policy_P, policy_R = self._computePpolicyPRpolicy()
        
        if self.verbose:
            print('  Iteration    V_variation')
        
        itr = 0
        done = False
        while not done:
            itr += 1
            
            Vprev = policy_V
            policy_V = policy_R + self.discount * policy_P.dot(Vprev)
            
            variation = absolute(policy_V - Vprev).max()
            if self.verbose:
                print('      %s         %s' % (itr, variation))
            
            # ensure |Vn - Vpolicy| < epsilon
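            # (standard contraction bound: ||V_n - V_policy|| <=
            # discount/(1 - discount) * ||V_n - V_n-1||, so a variation below
            # (1 - discount)/discount * epsilon gives an epsilon-accurate V)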
            if variation < ((1 - self.discount) / self.discount) * epsilon:
                done = True
                if self.verbose:
                    print("PyMDPtoolbox: iterations stopped, epsilon-optimal "
                          "value function.")
            elif itr == max_iter:
                done = True
                if self.verbose:
                    print("PyMDPtoolbox: iterations stopped by maximum number "
                          "of iteration condition.")
        
        self.V = policy_V
    
    def _evalPolicyMatrix(self):
        # Evaluate the value function of the policy using linear equations.
        #
        # Arguments 
        # ---------
        # Let S = number of states, A = number of actions
        # P(SxSxA) = transition matrix 
        #      P could be an array with 3 dimensions or a cell array (1xA),
        #      each cell containing a matrix (SxS) possibly sparse
        # R(SxSxA) or (SxA) = reward matrix
        #      R could be an array with 3 dimensions (SxSxA) or 
        #      a cell array (1xA), each cell containing a sparse matrix (SxS) or
        #      a 2D array(SxA) possibly sparse  
        # discount = discount rate in ]0; 1[
        # policy(S) = a policy
        #
        # Evaluation
        # ----------
        # Vpolicy(S) = value function of the policy
        #
        Ppolicy, Rpolicy = self._computePpolicyPRpolicy()
        # V = PR + gPV  => (I-gP)V = PR  => V = inv(I-gP)* PR
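        # (solved with numpy.linalg.solve rather than forming the inverse
        # explicitly)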
        self.V = self._lin_eq(
            (self._speye(self.S, self.S) - self.discount * Ppolicy), Rpolicy)
    
    def _iterate(self):
        # Run the policy iteration algorithm.
        # If verbose then print a header
        if self.verbose:
            print('  Iteration  Number_of_different_actions')
        # Set up the while stopping condition and the current time
        done = False
        self.time = time()
        # loop until a stopping condition is reached
        while not done:
            self.iter += 1
            # these _evalPolicy* functions will update the classes value
            # attribute
            if self.eval_type == "matrix":
                self._evalPolicyMatrix()
            elif self.eval_type == "iterative":
                self._evalPolicyIterative()
            # This should update the classes policy attribute but leave the
            # value alone
            policy_next, null = self._bellmanOperator()
            del null
            # calculate in how many places does the old policy disagree with
            # the new policy
            n_different = (policy_next != self.policy).sum()
            # if verbose then continue printing a table
            if self.verbose:
                print('       %s                 %s' % (self.iter,
                                                        n_different))
            # Once the policy is unchanging or the maximum number of
            # iterations has been reached then stop
            if n_different == 0:
                done = True
                if self.verbose:
                    print("PyMDPtoolbox: iterations stopped, unchanging "
                          "policy found.")
            elif (self.iter == self.max_iter):
                done = True 
                if self.verbose:
                    print("PyMDPtoolbox: iterations stopped by maximum number "
                          "of iteration condition.")
            else:
                self.policy = policy_next
        # update the time to return the computation time
        self.time = time() - self.time
        # store value and policy as tuples
        self.V = tuple(self.V.tolist())
        self.policy = tuple(self.policy.tolist())

class PolicyIterationModified(PolicyIteration):
    
    """A discounted MDP  solved using a modifified policy iteration algorithm.
    
    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or a cell array (1xA),
             each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount rate, in ]0, 1[
    epsilon  = epsilon-optimal policy search, greater than 0,
             optional (default 0.01)
    max_iter = maximum number of iterations to be done, greater than 0,
             optional (default 10)
    
    Data Attributes
    ---------------
    V(S)   = value function 
    policy(S) = optimal policy
    iter     = number of iterations performed
    cpu_time = used CPU time
    
    Notes
    -----
    In verbose mode, at each iteration, displays the variation of the
    value function between iteration n-1 and n
    
    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.forest()
    >>> pim = mdptoolbox.mdp.PolicyIterationModified(P, R, 0.9)
    >>> pim.policy
    FIXME
    >>> pim.V
    FIXME
    
    """
    
    def __init__(self, transitions, reward, discount, epsilon=0.01,
                 max_iter=10):
        # Initialise a (modified) policy iteration MDP.
        
        # Maybe it's better not to subclass from PolicyIteration, because the
        # initialisation of the two is quite different. E.g. there is policy0
        # being calculated here which doesn't need to be. The only thing that
        # is needed from the PolicyIteration class is the _evalPolicyIterative
        # function. Perhaps there is a better way to do it?
        PolicyIteration.__init__(self, transitions, reward, discount, None,
                                 max_iter, 1)
        
        # PolicyIteration doesn't pass epsilon to MDP.__init__() so we will
        # check it here
        if type(epsilon) in (int, float):
            if epsilon <= 0:
                raise ValueError("PyMDPtoolbox: epsilon must be greater than "
                                 "0.")
        else:
            raise ValueError("PyMDPtoolbox: epsilon must be a positive real "
                             "number greater than zero.")
        
        # computation of threshold of variation for V for an epsilon-optimal
        # policy
        if self.discount != 1:
            self.thresh = epsilon * (1 - self.discount) / self.discount
        else:
            self.thresh = epsilon
        
        self.epsilon = epsilon
        
        if discount == 1:
            self.V = zeros((self.S, 1))
        else:
            # min(min()) is not right
            self.V = (1 / (1 - discount) * min(rr.min() for rr in self.R) *
                      ones((self.S, 1)))
        
        # Call the iteration method
        self._iterate()
    
    def _iterate(self):
        # Run the modified policy iteration algorithm.
        
        if self.verbose:
            print('  Iteration  V_variation')
        
        self.time = time()
        
        done = False
        while not done:
            self.iter += 1
            
            self.policy, Vnext = self._bellmanOperator()
            #[Ppolicy, PRpolicy] = mdp_computePpolicyPRpolicy(P, PR, policy);
            
            variation = getSpan(Vnext - self.V)
            if self.verbose:
                print("      %s         %s" % (self.iter, variation))
            
            self.V = Vnext
            if variation < self.thresh:
                done = True
            else:
                is_verbose = False
                if self.verbose:
                    self.setSilent()
                    is_verbose = True
                
                self._evalPolicyIterative(self.V, self.epsilon, self.max_iter)
                
                if is_verbose:
                    self.setVerbose()
        
        self.time = time() - self.time
        
        # store value and policy as tuples
        self.V = tuple(self.V.tolist())
        self.policy = tuple(self.policy.tolist())

class QLearning(MDP):
    
    """A discounted MDP solved using the Q learning algorithm.
    
    Let S = number of states, A = number of actions
    
    Parameters
    ----------
    P : transition matrix (SxSxA)
        P could be an array with 3 dimensions or a cell array (1xA), each
        cell containing a sparse matrix (SxS)
    R : reward matrix(SxSxA) or (SxA)
        R could be an array with 3 dimensions (SxSxA) or a cell array
        (1xA), each cell containing a sparse matrix (SxS) or a 2D
        array(SxA) possibly sparse
    discount : discount rate
        in ]0; 1[    
    n_iter : number of iterations to execute (optional).
        Default value = 10000; it must be an integer greater than or equal
        to 10000.
    
    Results
    -------
    Q : learned Q matrix (SxA) 
    
    V : learned value function (S).
    
    policy : learned optimal policy (S).
    
    mean_discrepancy : vector of V discrepancy mean over 100 iterations
        The length of this vector is N/100, i.e. 100 for the default value
        of N.

    Examples
    --------
    >>> # These examples are reproducible only if random seed is set to 0 in
    >>> # both the random and numpy.random modules.
    >>> import numpy as np
    >>> import mdptoolbox, mdptoolbox.example
    >>> np.random.seed(0)
    >>> P, R = mdptoolbox.example.forest()
    >>> ql = mdptoolbox.mdp.QLearning(P, R, 0.96)
    >>> ql.Q
    array([[ 68.38037354,  43.24888454],
           [ 72.37777922,  42.75549145],
           [ 77.02892702,  64.68712932]])
    >>> ql.V
    (68.38037354422798, 72.37777921607258, 77.02892701616531)
    >>> ql.policy
    (0, 0, 0)
    
    >>> import mdptoolbox
    >>> import numpy as np
    >>> P = np.array([[[0.5, 0.5],[0.8, 0.2]],[[0, 1],[0.1, 0.9]]])
    >>> R = np.array([[5, 10], [-1, 2]])
    >>> np.random.seed(0)
    >>> ql = mdptoolbox.mdp.QLearning(P, R, 0.9)
    >>> ql.Q
    array([[ 39.933691  ,  43.17543338],
           [ 36.94394224,  35.42568056]])
    >>> ql.V
    (43.17543338090149, 36.943942243204454)
    >>> ql.policy
    (1, 0)
    
    """
    
    def __init__(self, transitions, reward, discount, n_iter=10000):
        # Initialise a Q-learning MDP.
        
        # The following check won't be done in MDP()'s initialisation, so let's
        # do it here
        self.max_iter = int(n_iter)
        assert self.max_iter >= 10000, "PyMDPtoolbox: n_iter should be " \
                                        "greater than 10000."
        
        # We don't want to send this to MDP because _computePR should not be
        # run on it, so check that it defines an MDP
        check(transitions, reward)
        
        if (transitions.dtype is object):
            self.P = transitions
            self.A = self.P.shape[0]
            self.S = self.P[0].shape[0]
        else: # convert to an object array
            self.A = transitions.shape[0]
            self.S = transitions.shape[1]
            self.P = zeros(self.A, dtype=object)
            for aa in range(self.A):
                self.P[aa] = transitions[aa, :, :]
        
        self.R = reward
        
        self.discount = discount
        
        # Initialisations
        self.Q = zeros((self.S, self.A))
        self.mean_discrepancy = []
        
        # Call the iteration method
        self._iterate()
        
    def _iterate(self):
        # Run the Q-learning algorithm.
        discrepancy = []
        
        self.time = time()
        
        # initial state choice
        s = randint(0, self.S)
        
        for n in range(1, self.max_iter + 1):
            
            # Reinitialisation of trajectories every 100 transitions
            if ((n % 100) == 0):
                s = randint(0, self.S)
            
            # Action choice : greedy with increasing probability
            # probability 1-(1/log(n+2)) can be changed
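            # (a decaying epsilon-greedy scheme: with probability
            # 1 - 1/log(n + 2) the greedy action is taken, otherwise a random
            # action is explored)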
            pn = random()
            if (pn < (1 - (1 / log(n + 2)))):