# -*- coding: utf-8 -*-

"""Markov Decision Process (MDP) Toolbox: ``mdp`` module
=====================================================

The ``mdp`` module provides classes for the resolution of discrete-time Markov
Decision Processes.

Available classes
-----------------
MDP
    Base Markov decision process class
FiniteHorizon
    Backwards induction finite horizon MDP
LP
    Linear programming MDP
PolicyIteration
    Policy iteration MDP
PolicyIterationModified
    Modified policy iteration MDP
QLearning
    Q-learning MDP
RelativeValueIteration
    Relative value iteration MDP
ValueIteration
    Value iteration MDP
ValueIterationGS
    Gauss-Seidel value iteration MDP

"""

# Copyright (c) 2011-2013 Steven A. W. Cordwell
# Copyright (c) 2009 INRA
# 
# All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 
#   * Redistributions of source code must retain the above copyright notice,
#     this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions and the following disclaimer in the documentation
#     and/or other materials provided with the distribution.
#   * Neither the name of the <ORGANIZATION> nor the names of its contributors
#     may be used to endorse or promote products derived from this software
#     without specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

from math import ceil, log, sqrt
from time import time

from numpy import absolute, array, empty, mean, mod, multiply
from numpy import ndarray, ones, zeros
from numpy.random import randint, random
from scipy.sparse import csr_matrix as sparse

from .utils import check, getSpan

class MDP(object):
    
    """A Markov Decision Problem.
    
    Parameters
    ----------
    transitions : array
        Transition probability matrices, one (S, S) matrix per action. This
        can be a 3-dimensional array or a list of possibly sparse matrices.
    reward : array
        Reward matrices or vectors.
    discount : float or None
        Discount factor, in ]0; 1]. None if the algorithm does not use a
        discount factor.
    epsilon : float or None
        Stopping criterion. None if the algorithm does not use one.
    max_iter : int or None
        Maximum number of iterations. None if the algorithm does not use one.
    
    Attributes
    ----------
    P : array
        Transition probability matrices
    R : array
        Reward matrices
    V : list
        Value function
    discount : float
        Discount rate on future rewards
    max_iter : int
        Maximum number of iterations
    policy : list
        Optimal policy
    time : float
        Time taken to run the algorithm
    verbose : logical
        Whether verbose output is displayed
    
    Methods
    -------
    run
        To be implemented in child classes, raises exception
    setSilent
        Turn the verbosity off
    setVerbose
        Turn the verbosity on
    
    """
    
    def __init__(self, transitions, reward, discount, epsilon, max_iter):
        # Initialise an MDP based on the input parameters.
        
        # if the discount is None then the algorithm is assumed to not use it
        # in its computations
        if discount is not None:
            self.discount = float(discount)
            assert 0.0 < self.discount <= 1.0, "Discount rate must be in ]0; 1]"
            if self.discount == 1:
                print("PyMDPtoolbox WARNING: check conditions of convergence. "
                      "With no discount, convergence cannot be assumed.")
        # if the max_iter is None then the algorithm is assumed to not use it
        # in its computations
        if max_iter is not None:
            self.max_iter = int(max_iter)
            assert self.max_iter > 0, "The maximum number of iterations " \
                                      "must be greater than 0."
        # check that epsilon is something sane
        if epsilon is not None:
            self.epsilon = float(epsilon)
            assert self.epsilon > 0, "Epsilon must be greater than 0."
        # we run a check on P and R to make sure they are describing an MDP. If
        # an exception isn't raised then they are assumed to be correct.
        check(transitions, reward)
        # computePR will assign the variables self.S, self.A, self.P and self.R
        self._computePR(transitions, reward)
        # the verbosity is by default turned off
        self.verbose = False
        # Initially the time taken to perform the computations is set to None
        self.time = None
        # set the initial iteration count to zero
        self.iter = 0
        # V should be stored as a vector ie shape of (S,) or (1, S)
        self.V = None
        # policy can also be stored as a vector
        self.policy = None
    
    def __repr__(self):
        P_repr = "P: \n"
        R_repr = "R: \n"
        for aa in range(self.A):
            P_repr += repr(self.P[aa]) + "\n"
            R_repr += repr(self.R[aa]) + "\n"
        return(P_repr + "\n" + R_repr)
    
    def _bellmanOperator(self, V=None):
        # Apply the Bellman operator on the value function.
        # 
        # Updates the value function and the Vprev-improving policy.
        # 
        # Returns: (policy, value), tuple of new policy and its value
        #
        # If V hasn't been sent into the method, then we assume to be working
        # on the object's V attribute
        if V is None:
            # this V should be a reference to the data rather than a copy
            V = self.V
        else:
            # make sure the user supplied V is of the right shape
            try:
                assert V.shape in ((self.S,), (1, self.S)), "V is not the " \
                    "right shape (Bellman operator)."
            except AttributeError:
                raise TypeError("V must be a numpy array or matrix.")
        # Looping through each action, the Q-value matrix is calculated.
        # P and V can be any object that supports indexing, so it is important
        # that you know they define a valid MDP before calling the
        # _bellmanOperator method. Otherwise the results will be meaningless.
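        # As a sketch of what the loop below computes (this comment is not
        # part of the original source): for each action a and state s,
        #     Q[a, s] = R[a][s] + discount * sum_s' P[a][s, s'] * V[s']
        # so Q has shape (A, S), the greedy policy is Q.argmax(axis=0) and
        # the backed-up value function is Q.max(axis=0).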
        Q = empty((self.A, self.S))
        for aa in range(self.A):
            Q[aa] = self.R[aa] + self.discount * self.P[aa].dot(V)
        # Get the policy and value, for now it is being returned but...
        # Which way is better?
        # 1. Return, (policy, value)
        return (Q.argmax(axis=0), Q.max(axis=0))
        # 2. update self.policy and self.V directly
        # self.V = Q.max(axis=0)
        # self.policy = Q.argmax(axis=0)
    
    def _computeP(self, P):
        # Set self.P as a tuple of length A, with each element storing an S×S
        # matrix.
        self.A = len(P)
        try:
            if P.ndim == 3:
                self.S = P.shape[1]
            else:
                self.S = P[0].shape[0]
        except AttributeError:
            self.S = P[0].shape[0]
        # convert P to a tuple of numpy arrays
        self.P = tuple([P[aa] for aa in range(self.A)])
    
    def _computePR(self, P, R):
        # Compute the reward for the system in one state choosing an action.
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        #    P(SxSxA)  = transition matrix 
        #        P could be an array with 3 dimensions or a cell array (1xA), 
        #        each cell containing a matrix (SxS) possibly sparse
        #    R(SxSxA) or (SxA) = reward matrix
        #        R could be an array with 3 dimensions (SxSxA) or a cell array 
        #        (1xA), each cell containing a sparse matrix (SxS) or a 2D 
        #        array(SxA) possibly sparse  
        # Evaluation
        # ----------
        #    PR(SxA)   = reward matrix
        #
        # We assume that P and R define an MDP, i.e. the assumption is that
        # check(P, R) has already been run and doesn't fail.
        #
        # First compute and store P, S, and A
        self._computeP(P)
        # Set self.R as a tuple of length A, with each element storing a 1×S
        # vector.
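        # Sketch (this comment is not part of the original source): when R is
        # given per transition, i.e. one (S, S) matrix per action, the
        # expected immediate reward computed below is
        #     R[a][s] = sum_s' P[a][s, s'] * R[a][s, s']
        # which is what multiply(P[aa], R[aa]).sum(1) evaluates.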
        try:
            if R.ndim == 1:
                r = array(R).reshape(self.S)
                self.R = tuple([r for aa in range(self.A)])
            elif R.ndim == 2:
                self.R = tuple([array(R[:, aa]).reshape(self.S)
                                for aa in range(self.A)])
            else:
                self.R = tuple([multiply(P[aa], R[aa]).sum(1).reshape(self.S)
                                for aa in range(self.A)])
        except AttributeError:
            if len(R) == self.A:
                self.R = tuple([multiply(P[aa], R[aa]).sum(1).reshape(self.S)
                                for aa in range(self.A)])
            else:
                r = array(R).reshape(self.S)
                self.R = tuple([r for aa in range(self.A)])
    
    def run(self):
        # Raise error because child classes should implement this function.
        raise NotImplementedError("You should create a run() method.")
    
    def setSilent(self):
        """Set the MDP algorithm to silent mode."""
        self.verbose = False
    
    def setVerbose(self):
        """Set the MDP algorithm to verbose mode."""
        self.verbose = True

class FiniteHorizon(MDP):
    
    """An MDP solved using the finite-horizon backwards induction algorithm.
    
    Let S = number of states, A = number of actions
    
    Parameters
    ----------
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or a cell array (1xA),
             each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount factor, in ]0, 1]
    N        = number of periods, greater than 0
    h(S)     = terminal reward, optional (default [0; 0; ... 0] )
    
    Attributes
    ----------
    V(S,N+1)     = optimal value function
                 V(:,n) = optimal value function at stage n
                        with stage in 1, ..., N
                        V(:,N+1) = value function for terminal stage
    policy(S,N)  = optimal policy
                 policy(:,n) = optimal policy at stage n
                        with stage in 1, ..., N
                        policy(:,N) = policy for stage N
    cpu_time = used CPU time
    
    Methods
    -------
    run()
        Runs the backwards induction algorithm.
    
    Notes
    -----
    In verbose mode, displays the current stage and policy transpose.
    
    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.forest()
    >>> fh = mdptoolbox.mdp.FiniteHorizon(P, R, 0.9, 3)
    >>> fh.run()
    >>> fh.V
    array([[ 2.6973,  0.81  ,  0.    ,  0.    ],
           [ 5.9373,  3.24  ,  1.    ,  0.    ],
           [ 9.9373,  7.24  ,  4.    ,  0.    ]])
    >>> fh.policy
    array([[0, 0, 0],
           [0, 0, 1],
           [0, 0, 0]])
    
    """

    def __init__(self, transitions, reward, discount, N, h=None):
        # Initialise a finite horizon MDP.
        self.N = int(N)
        assert self.N > 0, 'PyMDPtoolbox: N must be greater than 0.'
        # Initialise the base class
        MDP.__init__(self, transitions, reward, discount, None, None)
        # remove the iteration counter, it is not meaningful for backwards
        # induction
        del self.iter
        # There are value vectors for each time step up to the horizon
        self.V = zeros((self.S, N + 1))
        # There are policy vectors for each time step before the horizon, when
        # we reach the horizon we don't need to make decisions anymore.
        self.policy = empty((self.S, N), dtype=int)
        # Set the reward for the final transition to h, if specified.
        if h is not None:
            self.V[:, N] = h
        # Call the iteration method
        #self.run()
        
    def run(self):
        # Run the finite horizon algorithm.
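        # Sketch of the recursion performed below (this comment is not part of
        # the original source): with V[:, N] holding the terminal reward h,
        # each pass computes
        #     V[:, t] = max_a ( R[a] + discount * P[a] V[:, t+1] )
        # for t = N-1, ..., 0 and stores the maximising action in policy[:, t].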
        self.time = time()
        # loop through each time period
        for n in range(self.N):
            W, X = self._bellmanOperator(self.V[:, self.N - n])
            self.V[:, self.N - n - 1] = X
            self.policy[:, self.N - n - 1] = W
            if self.verbose:
                print(("stage: %s ... policy transpose : %s") % (
                    self.N - n, self.policy[:, self.N - n - 1].tolist()))
        # update time spent running
        self.time = time() - self.time
        # After this we could create a tuple of tuples for the values and 
        # policies.
        #V = []
        #p = []
        #for n in xrange(self.N):
        #    V.append()
        #    p.append()
        #V.append()
        #self.V = tuple(V)
        #self.policy = tuple(p)

class LP(MDP):
    
    """A discounted MDP solved using linear programming.

    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or a cell array (1xA),
             each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount rate, in ]0; 1[
    
    Evaluation
    ----------
    V(S)   = optimal values
    policy(S) = optimal policy
    cpu_time = used CPU time
    
    Notes
    -----
    In verbose mode, the progress output of the cvxopt solver is displayed.
    
    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.forest()
    >>> lp = mdptoolbox.mdp.LP(P, R, 0.9)
    >>> lp.run()
    
    """

    def __init__(self, transitions, reward, discount):
        # Initialise a linear programming MDP.
        # import some functions from cvxopt and set them as object methods
        try:
            from cvxopt import matrix, solvers
            self._linprog = solvers.lp
            self._cvxmat = matrix
        except ImportError:
            raise ImportError("The python module cvxopt is required to use "
                              "linear programming functionality.")
        # we also need diagonal matrices, and using a sparse one may be more
        # memory efficient
        from scipy.sparse import eye as speye
        self._speye = speye
        # initialise the MDP. epsilon and max_iter are not needed
        MDP.__init__(self, transitions, reward, discount, None, None)
        # Set the cvxopt solver to be quiet by default, but ...
        # this doesn't do what I want it to do c.f. issue #3
        if not self.verbose:
            solvers.options['show_progress'] = False
        # Call the iteration method
        #self.run()
    
    def run(self):
        # Run the linear programming algorithm.
        self.time = time()
        # The objective is to resolve : min V / V >= PR + discount*P*V
        # The function linprog of the optimisation Toolbox of Mathworks
        # resolves :
        # min f'* x / M * x <= b
        # So the objective could be expressed as :
        # min V / (discount*P-I) * V <= - PR
        # To avoid loop on states, the matrix M is structured following actions
        # M(A*S,S)
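        # Sketch of the linear program assembled below (this comment is not
        # part of the original source): with decision vector V of length S and
        # an objective vector f of ones, each action a contributes the block
        # of constraints
        #     (discount * P[a] - I) V <= -R[a]
        # and a single Bellman backup afterwards recovers the greedy policy
        # from the optimal V.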
        f = self._cvxmat(ones((self.S, 1)))
        # self.R is stored as a tuple of per-action vectors, so stack it into
        # a single (S*A, 1) column, one block of S rewards per action
        h = self._cvxmat(array(self.R).reshape(self.S * self.A, 1), tc='d')
        M = zeros((self.A * self.S, self.S))
        for aa in range(self.A):
            pos = (aa + 1) * self.S
            M[(pos - self.S):pos, :] = (
                self.discount * self.P[aa] - self._speye(self.S, self.S))
        M = self._cvxmat(M)
        # Using the glpk option will make this behave more like Octave
        # (Octave uses glpk) and perhaps Matlab. If solver=None (ie using the 
        # default cvxopt solver) then V agrees with the Octave equivalent
        # only to 10e-8 places. This assumes glpk is installed of course.
        self.V = array(
            self._linprog(f, M, -h, solver='glpk')['x']).reshape(self.S)
        # apply the Bellman operator
        self.policy, self.V = self._bellmanOperator()
        # update the time spent solving
        self.time = time() - self.time
        # store value and policy as tuples
        self.V = tuple(self.V.tolist())
        self.policy = tuple(self.policy.tolist())

class PolicyIteration(MDP):
    
    """A discounted MDP solved using the policy iteration algorithm.
    
    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or a cell array (1xA),
             each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount rate, in ]0, 1[
    policy0(S) = starting policy, optional 
    max_iter = maximum number of iterations to be done, greater than 0, 
             optional (default 1000)
    eval_type = type of function used to evaluate policy: 
             0 for mdp_eval_policy_matrix, else mdp_eval_policy_iterative
             optional (default 0)
             
    Evaluation
    ----------
    V(S)   = value function 
    policy(S) = optimal policy
    iter     = number of iterations performed
    cpu_time = used CPU time
    
    Notes
    -----
    In verbose mode, at each iteration, displays the number
    of different actions between policy n-1 and n
    
    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.rand()
    >>> pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.9)
    >>> pi.run()
    
    >>> P, R = mdptoolbox.example.forest()
    >>> pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.9)
    >>> pi.run()
    >>> pi.V
    (26.244000000000018, 29.48400000000002, 33.484000000000016)
    >>> pi.policy
    (0, 0, 0)
    """
    
    def __init__(self, transitions, reward, discount, policy0=None,
                 max_iter=1000, eval_type=0):
        # Initialise a policy iteration MDP.
        #
        # Set up the MDP, but don't need to worry about epsilon values
        MDP.__init__(self, transitions, reward, discount, None, max_iter)
        # Check if the user has supplied an initial policy. If not make one.
        if policy0 is None:
            # Initialise the policy to the one which maximises the expected
            # immediate reward
            null = zeros(self.S)
            self.policy, null = self._bellmanOperator(null)
            del null
        else:
            # Use the policy that the user supplied
            # Make sure it is a numpy array
            policy0 = array(policy0)
            # Make sure the policy is the right size and shape
            if policy0.shape not in ((self.S, ), (self.S, 1), (1, self.S)):
                raise ValueError("PyMDPtoolbox: policy0 must be a vector with "
                                 "length S.")
            # reshape the policy to be a vector
            policy0 = policy0.reshape(self.S)
            # The policy can only contain integers between 0 and S-1
            if (mod(policy0, 1).any() or (policy0 < 0).any() or
                    (policy0 >= self.S).any()):
                raise ValueError("PyMDPtoolbox: policy0 must be a vector of "
                                 "integers between 0 and S-1.")
            else:
                self.policy = policy0
        # set the initial values to zero
        self.V = zeros(self.S)
        # Do some setup depending on the evaluation type
        if eval_type in (0, "matrix"):
            from numpy.linalg import solve
            from scipy.sparse import eye
            self._speye = eye
            self._lin_eq = solve
            self.eval_type = "matrix"
        elif eval_type in (1, "iterative"):
            self.eval_type = "iterative"
        else:
            raise ValueError("PyMDPtoolbox: eval_type should be 0 for matrix "
                             "evaluation or 1 for iterative evaluation. "
                             "The strings 'matrix' and 'iterative' can also "
                             "be used.")
        # Call the iteration method
        #self.run()
    
    def _computePpolicyPRpolicy(self):
        # Compute the transition matrix and the reward matrix for a policy.
        #
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        # P(SxSxA)  = transition matrix 
        #     P could be an array with 3 dimensions or a cell array (1xA),
        #     each cell containing a matrix (SxS) possibly sparse
        # R(SxSxA) or (SxA) = reward matrix
        #     R could be an array with 3 dimensions (SxSxA) or 
        #     a cell array (1xA), each cell containing a sparse matrix (SxS) or
        #     a 2D array(SxA) possibly sparse  
        # policy(S) = a policy
        #
        # Evaluation
        # ----------
        # Ppolicy(SxS)  = transition matrix for policy
        # PRpolicy(S)   = reward matrix for policy
        #
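        # Sketch (this comment is not part of the original source): for the
        # current policy pi,
        #     Ppolicy[s, :] = P[pi(s)][s, :]   and   Rpolicy[s] = R[pi(s)][s]
        # i.e. each state keeps only the row of P and the entry of R selected
        # by its chosen action.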
        Ppolicy = empty((self.S, self.S))
        Rpolicy = zeros(self.S)
        for aa in range(self.A): # avoid looping over S
            # the rows that use action a.
            ind = (self.policy == aa).nonzero()[0]
            # if no rows use action a, then no need to assign this
            if ind.size > 0:
                try:
                    Ppolicy[ind, :] = self.P[aa][ind, :]
                except ValueError:
                    Ppolicy[ind, :] = self.P[aa][ind, :].todense()
                #PR = self._computePR() # an apparently unneeded line, and
                # perhaps harmful in this implementation c.f.
                # mdp_computePpolicyPRpolicy.m
                Rpolicy[ind] = self.R[aa][ind]
        # self.R cannot be sparse with the code in its current condition, but
        # it should be possible in the future. Also, if R is so big that it's
        # a good idea to use a sparse matrix for it, then converting PRpolicy
        # from a dense to sparse matrix doesn't seem very memory efficient
        if type(self.R) is sparse:
            Rpolicy = sparse(Rpolicy)
        #self.Ppolicy = Ppolicy
        #self.Rpolicy = Rpolicy
        return (Ppolicy, Rpolicy)
    
    def _evalPolicyIterative(self, V0=0, epsilon=0.0001, max_iter=10000):
        # Evaluate a policy using iteration.
        #
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        # P(SxSxA)  = transition matrix 
        #    P could be an array with 3 dimensions or 
        #    a cell array (1xA), each cell containing a matrix possibly sparse
        # R(SxSxA) or (SxA) = reward matrix
        #    R could be an array with 3 dimensions (SxSxA) or 
        #    a cell array (1xA), each cell containing a sparse matrix (SxS) or
        #    a 2D array(SxA) possibly sparse  
        # discount  = discount rate in ]0; 1[
        # policy(S) = a policy
        # V0(S)     = starting value function, optional (default : zeros(S,1))
        # epsilon   = epsilon-optimal policy search, greater than 0,
        #    optional (default : 0.0001)
        # max_iter  = maximum number of iterations to be done, greater than 0,
        #    optional (default : 10000)
        #    
        # Evaluation
        # ----------
        # Vpolicy(S) = value function, associated to a specific policy
        #
        # Notes
        # -----
        # In verbose mode, at each iteration, displays the condition which
        # stopped iterations: epsilon-optimum value function found or maximum
        # number of iterations reached.
        #
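        # Sketch (this comment is not part of the original source): the loop
        # below performs the fixed-point iteration
        #     V_{k+1} = Rpolicy + discount * Ppolicy * V_k
        # and stops once the largest change falls below
        #     (1 - discount) / discount * epsilon
        # or max_iter sweeps have been done.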
        if (type(V0) in (int, float)) and (V0 == 0):
            policy_V = zeros(self.S)
        else:
            if (isinstance(V0, ndarray) and
                    V0.shape in ((self.S,), (self.S, 1))):
                policy_V = array(V0).reshape(self.S)
            else:
                raise ValueError("PyMDPtoolbox: V0 vector/array type not "
                                 "supported. Use an ndarray or matrix column "
                                 "vector of length S.")
        
        policy_P, policy_R = self._computePpolicyPRpolicy()
        
        if self.verbose:
            print('  Iteration    V_variation')
        
        itr = 0
        done = False
        while not done:
            itr += 1
            
            Vprev = policy_V
            policy_V = policy_R + self.discount * policy_P.dot(Vprev)
            
            variation = absolute(policy_V - Vprev).max()
            if self.verbose:
                print(('      %s         %s') % (itr, variation))
            
            # ensure |Vn - Vpolicy| < epsilon
            if variation < ((1 - self.discount) / self.discount) * epsilon:
                done = True
                if self.verbose:
                    print("PyMDPtoolbox: iterations stopped, epsilon-optimal "
                          "value function.")
            elif itr == max_iter:
                done = True
                if self.verbose:
                    print("PyMDPtoolbox: iterations stopped by maximum number "
                          "of iteration condition.")
        
        self.V = policy_V
    
    def _evalPolicyMatrix(self):
        # Evaluate the value function of the policy using linear equations.
        #
        # Arguments 
        # ---------
        # Let S = number of states, A = number of actions
        # P(SxSxA) = transition matrix 
        #      P could be an array with 3 dimensions or a cell array (1xA),
        #      each cell containing a matrix (SxS) possibly sparse
        # R(SxSxA) or (SxA) = reward matrix
        #      R could be an array with 3 dimensions (SxSxA) or 
        #      a cell array (1xA), each cell containing a sparse matrix (SxS) or
        #      a 2D array(SxA) possibly sparse  
        # discount = discount rate in ]0; 1[
        # policy(S) = a policy
        #
        # Evaluation
        # ----------
        # Vpolicy(S) = value function of the policy
        #
        Ppolicy, Rpolicy = self._computePpolicyPRpolicy()
        # V = PR + gPV  => (I-gP)V = PR  => V = inv(I-gP)* PR
        self.V = self._lin_eq(
            (self._speye(self.S, self.S) - self.discount * Ppolicy), Rpolicy)
    
    def run(self):
        # Run the policy iteration algorithm.
        # If verbose then print a header
        if self.verbose:
            print('  Iteration  Number_of_different_actions')
        # Set up the while stopping condition and the current time
        done = False
        self.time = time()
        # loop until a stopping condition is reached
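        # Sketch of the loop below (this comment is not part of the original
        # source): repeat
        #   1. policy evaluation:  compute V for the current policy (matrix
        #      solve or iterative sweeps)
        #   2. policy improvement: take the greedy policy from one Bellman
        #      backup of that V
        # until the policy stops changing or max_iter is reached.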
        while not done:
            self.iter += 1
            # these _evalPolicy* functions will update the class's value
            # attribute
            if self.eval_type == "matrix":
                self._evalPolicyMatrix()
            elif self.eval_type == "iterative":
                self._evalPolicyIterative()
            # This should update the class's policy attribute but leave the
            # value alone
            policy_next, null = self._bellmanOperator()
            del null
            # calculate in how many places the old policy disagrees with
            # the new policy
            n_different = (policy_next != self.policy).sum()
            # if verbose then continue printing a table
            if self.verbose:
                print(('       %s                 %s') % (self.iter,
                                                         n_different))
            # Once the policy is unchanging or the maximum number of
            # iterations has been reached then stop
            if n_different == 0:
                done = True
                if self.verbose:
                    print("PyMDPtoolbox: iterations stopped, unchanging "
                          "policy found.")
            elif (self.iter == self.max_iter):
                done = True
                if self.verbose:
                    print("PyMDPtoolbox: iterations stopped by maximum number "
                          "of iteration condition.")
            else:
                self.policy = policy_next
        # update the time to return the computation time
        self.time = time() - self.time
        # store value and policy as tuples
        self.V = tuple(self.V.tolist())
        self.policy = tuple(self.policy.tolist())

class PolicyIterationModified(PolicyIteration):
    
    """A discounted MDP solved using a modified policy iteration algorithm.
    
    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or a cell array (1xA),
             each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount rate, in ]0, 1[
    epsilon  = epsilon-optimal policy search, greater than 0,
             optional (default 0.01)
    max_iter = maximum number of iterations to be done, greater than 0, 
             optional (default 10)
    
    Data Attributes
    ---------------
    V(S)   = value function 
    policy(S) = optimal policy
    iter     = number of iterations performed
    cpu_time = used CPU time
    
    Notes
    -----
    In verbose mode, at each iteration, displays the number
    of different actions between policy n-1 and n
    
    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.forest()
    >>> pim = mdptoolbox.mdp.PolicyIterationModified(P, R, 0.9)
    >>> pim.run()
    >>> pim.policy
    FIXME
    >>> pim.V
    FIXME
    
    """
    
    def __init__(self, transitions, reward, discount, epsilon=0.01,
                 max_iter=10):
        # Initialise a (modified) policy iteration MDP.
        
        # Maybe it's better not to subclass from PolicyIteration, because the
        # initialisation of the two are quite different. eg there is policy0
        # being calculated here which doesn't need to be. The only thing that
        # is needed from the PolicyIteration class is the _evalPolicyIterative
        # function. Perhaps there is a better way to do it?
        PolicyIteration.__init__(self, transitions, reward, discount, None,
                                 max_iter, 1)
        
        # PolicyIteration doesn't pass epsilon to MDP.__init__() so we will
        # check it here
        if type(epsilon) in (int, float):
            if epsilon <= 0:
                raise ValueError("PyMDPtoolbox: epsilon must be greater than "
                                 "0.")
        else:
            raise ValueError("PyMDPtoolbox: epsilon must be a positive real "
                             "number greater than zero.")
        
        # computation of threshold of variation for V for an epsilon-optimal
        # policy
        if self.discount != 1:
            self.thresh = epsilon * (1 - self.discount) / self.discount
        else:
            self.thresh = epsilon
        
        self.epsilon = epsilon
        
        if discount == 1:
            self.V = zeros(self.S)
        else:
            # initialise V with the smallest possible discounted total reward;
            # self.R is a tuple of per-action reward vectors, so take the
            # minimum over all of them
            Rmin = min(self.R[aa].min() for aa in range(self.A))
            self.V = 1 / (1 - discount) * Rmin * ones(self.S)
        
        # Call the iteration method
        #self.run()
    
    def run(self):
        # Run the modified policy iteration algorithm.
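        # Sketch of the scheme below (this comment is not part of the original
        # source): each pass does one Bellman backup to obtain a greedy policy
        # and value Vnext; if the span of (Vnext - V) is not yet below the
        # threshold, that policy is evaluated only approximately with a
        # bounded number of iterative sweeps rather than an exact solve.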
        
        if self.verbose:
            print('\tIteration\tV-variation')
        
        self.time = time()
        
        done = False
        while not done:
            self.iter += 1
            
            self.policy, Vnext = self._bellmanOperator()
            #[Ppolicy, PRpolicy] = mdp_computePpolicyPRpolicy(P, PR, policy);
            
            variation = getSpan(Vnext - self.V)
            if self.verbose:
                print(("\t%s\t%s" % (self.iter, variation)))
            
            self.V = Vnext
            if variation < self.thresh:
                done = True
            else:
                is_verbose = False
                if self.verbose:
                    self.setSilent()
                    is_verbose = True
                
                self._evalPolicyIterative(self.V, self.epsilon, self.max_iter)
                
                if is_verbose:
                    self.setVerbose()
        
        self.time = time() - self.time
        
        # store value and policy as tuples
        self.V = tuple(self.V.tolist())
        self.policy = tuple(self.policy.tolist())

class QLearning(MDP):
    
    """A discounted MDP solved using the Q learning algorithm.
    
    Let S = number of states, A = number of actions
    
    Parameters
    ----------
    P : transition matrix (SxSxA)
        P could be an array with 3 dimensions or a cell array (1xA), each
        cell containing a sparse matrix (SxS)
    R : reward matrix(SxSxA) or (SxA)
        R could be an array with 3 dimensions (SxSxA) or a cell array
        (1xA), each cell containing a sparse matrix (SxS) or a 2D
        array(SxA) possibly sparse
    discount : discount rate
        in ]0; 1[
    n_iter : number of iterations to execute (optional).
        Default value = 10000; it must be an integer greater than or equal
        to the default value.
    
    Results
    -------
    Q : learned Q matrix (SxA) 
    
    V : learned value function (S).
    
    policy : learned optimal policy (S).
    
    mean_discrepancy : vector of mean V discrepancies, one entry per batch
        of 100 iterations. For the default value of n_iter its length is
        100 (n_iter/100).

    Examples
    --------
    >>> # These examples are reproducible only if random seed is set to 0 in
    >>> # both the random and numpy.random modules.
    >>> import numpy as np
    >>> import mdptoolbox, mdptoolbox.example
    >>> np.random.seed(0)
    >>> P, R = mdptoolbox.example.forest()
    >>> ql = mdptoolbox.mdp.QLearning(P, R, 0.96)
    >>> ql.run()
    >>> ql.Q
    array([[ 68.38037354,  43.24888454],
           [ 72.37777922,  42.75549145],
           [ 77.02892702,  64.68712932]])
    >>> ql.V
    (68.38037354422798, 72.37777921607258, 77.02892701616531)
    >>> ql.policy
    (0, 0, 0)
    
    >>> import mdptoolbox
    >>> import numpy as np
    >>> P = np.array([[[0.5, 0.5],[0.8, 0.2]],[[0, 1],[0.1, 0.9]]])
    >>> R = np.array([[5, 10], [-1, 2]])
    >>> np.random.seed(0)
    >>> ql = mdptoolbox.mdp.QLearning(P, R, 0.9)
    >>> ql.run()
    >>> ql.Q
    array([[ 39.933691  ,  43.17543338],
           [ 36.94394224,  35.42568056]])
    >>> ql.V
    (43.17543338090149, 36.943942243204454)
    >>> ql.policy
    (1, 0)
    
    """
    
    def __init__(self, transitions, reward, discount, n_iter=10000):
        # Initialise a Q-learning MDP.
        
        # The following check won't be done in MDP()'s initialisation, so let's
        # do it here
        self.max_iter = int(n_iter)
        assert self.max_iter >= 10000, "PyMDPtoolbox: n_iter should be " \
                                        "greater than or equal to 10000."
        
        # We don't want to send this to MDP because _computePR should not be
        # run on it, so check that it defines an MDP
        check(transitions, reward)
        
        # Store P, S, and A
        self._computeP(transitions)
        
        self.R = reward
        
        self.discount = discount
        
        # Initialisations
        self.Q = zeros((self.S, self.A))
        self.mean_discrepancy = []
        
        # Call the iteration method
        #self.run()
        
    def run(self):
        # Run the Q-learning algorithm.
        discrepancy = []
        
        self.time = time()
        
        # initial state choice
        s = randint(0, self.S)
        
        for n in range(1, self.max_iter + 1):
            
            # Reinitialisation of trajectories every 100 transitions
            if ((n % 100) == 0):
                s = randint(0, self.S)
            
            # Action choice : greedy with increasing probability
            # probability 1-(1/log(n+2)) can be changed
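            # A worked example of this schedule (this comment is not part of
            # the original source): the exploration probability 1/log(n + 2)
            # is about 0.91 at n = 1 and falls to roughly 0.11 by n = 10000,
            # so action selection becomes increasingly greedy.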
            pn = random()
            if (pn < (1 - (1 / log(n + 2)))):
                # optimal_action = self.Q[s, :].max()
                a = self.Q[s, :].argmax()
            else:
                a = randint(0, self.A)
            
            # Simulating next state s_new and reward associated to <s,s_new,a>
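            # Sketch of the sampling below (this comment is not part of the
            # original source): s_new is drawn from the distribution P[a][s, :]
            # by inverse transform sampling, walking through the cumulative
            # probabilities until they exceed the uniform draw p_s_new.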
            p_s_new = random()
            p = 0
            s_new = -1
            while ((p < p_s_new) and (s_new < (self.S - 1))):
                s_new = s_new + 1
                p = p + self.P[a][s, s_new]
            
            try:
                r = self.R[a][s, s_new]
            except IndexError: