# -*- coding: utf-8 -*-

"""Markov Decision Process (MDP) Toolbox: ``mdp`` module
=====================================================

The ``mdp`` module provides classes for the resolution of discrete-time Markov
Decision Processes.

Available classes
-----------------
MDP
    Base Markov decision process class
FiniteHorizon
    Backwards induction finite horizon MDP
LP
    Linear programming MDP
PolicyIteration
    Policy iteration MDP
PolicyIterationModified
    Modified policy iteration MDP
QLearning
    Q-learning MDP
RelativeValueIteration
    Relative value iteration MDP
ValueIteration
    Value iteration MDP
ValueIterationGS
    Gauss-Seidel value iteration MDP

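Example
-------
A minimal usage sketch, mirroring the ``PolicyIteration`` doctest further down
in this module (it assumes the companion ``mdptoolbox.example`` module is
available to build a small test problem):

>>> import mdptoolbox, mdptoolbox.example
>>> P, R = mdptoolbox.example.forest()
>>> pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.9)
>>> pi.run()
>>> pi.policy
(0, 0, 0)
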
"""

# Copyright (c) 2011-2013 Steven A. W. Cordwell
# Copyright (c) 2009 INRA
# 
# All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 
#   * Redistributions of source code must retain the above copyright notice,
#     this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions and the following disclaimer in the documentation
#     and/or other materials provided with the distribution.
#   * Neither the name of the <ORGANIZATION> nor the names of its contributors
#     may be used to endorse or promote products derived from this software
#     without specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

from math import ceil, log, sqrt
from time import time

from numpy import absolute, array, empty, mean, mod, multiply, ones, zeros
from numpy.random import randint, random
from scipy.sparse import csr_matrix as sparse

from .utils import check, getSpan

class MDP(object):

    """A Markov Decision Problem.
    
    Parameters
    ----------
    transitions : array
        transition probability matrices
    reward : array
        reward matrices
    discount : float or None
        discount factor
    epsilon : float or None
        stopping criteria
    max_iter : int or None
        maximum number of iterations
    
    Attributes
    ----------
    P : array
        Transition probability matrices
    R : array
        Reward matrices
    V : list
        Value function
    discount : float
        The discount rate used in the calculations
    max_iter : int
        The maximum number of iterations
    policy : list
        The optimal policy
    time : float
        The time used to solve the MDP
    verbose : logical
        Whether verbose output is displayed
    
    Methods
    -------
    run
        To be implemented in child classes, raises a NotImplementedError
    setSilent
        Turn the verbosity off
    setVerbose
        Turn the verbosity on
    
    """
    
    def __init__(self, transitions, reward, discount, epsilon, max_iter):
        # Initialise an MDP based on the input parameters.
        
        # if the discount is None then the algorithm is assumed to not use it
        # in its computations
        if discount is not None:
            self.discount = float(discount)
            assert 0.0 < self.discount <= 1.0, "Discount rate must be in ]0; 1]"
            if self.discount == 1:
                print("PyMDPtoolbox WARNING: check conditions of convergence. "
                      "With no discount, convergence is not always assumed.")
        # if the max_iter is None then the algorithm is assumed to not use it
        # in its computations
        if max_iter is not None:
            self.max_iter = int(max_iter)
            assert self.max_iter > 0, "The maximum number of iterations " \
                                      "must be greater than 0."
        # check that epsilon is something sane
        if epsilon is not None:
            self.epsilon = float(epsilon)
            assert self.epsilon > 0, "Epsilon must be greater than 0."
        # we run a check on P and R to make sure they are describing an MDP. If
        # an exception isn't raised then they are assumed to be correct.
        check(transitions, reward)
        # computePR will assign the variables self.S, self.A, self.P and self.R
        self._computePR(transitions, reward)
        # the verbosity is by default turned off
        self.verbose = False
        # Initially the time taken to perform the computations is set to None
        self.time = None
        # set the initial iteration count to zero
        self.iter = 0
        # V should be stored as a vector, i.e. a shape of (S,) or (1, S)
        self.V = None
        # policy can also be stored as a vector
        self.policy = None
    
    def __repr__(self):
        P_repr = "P: \n"
        R_repr = "R: \n"
        for aa in range(self.A):
            P_repr += repr(self.P[aa]) + "\n"
            R_repr += repr(self.R[aa]) + "\n"
        return(P_repr + "\n" + R_repr)
    
    def _bellmanOperator(self, V=None):
        # Apply the Bellman operator on the value function.
        # 
        # Updates the value function and the Vprev-improving policy.
        # 
        # Returns: (policy, value), tuple of new policy and its value
        #
        # If V hasn't been sent into the method, then we assume to be working
        # on the object's V attribute
        if V is None:
            # this V should be a reference to the data rather than a copy
            V = self.V
        else:
            # make sure the user supplied V is of the right shape
            try:
                assert V.shape in ((self.S,), (1, self.S)), "V is not the " \
                    "right shape (Bellman operator)."
            except AttributeError:
                raise TypeError("V must be a numpy array or matrix.")
        # Looping through each action the Q-value matrix is calculated.
        # P and V can be any object that supports indexing, so it is important
        # that you know they define a valid MDP before calling the
        # _bellmanOperator method. Otherwise the results will be meaningless.
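        # Q has shape (A, S): row aa holds, for every state, the value of
        # taking action aa and then following the current estimate V.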
        Q = empty((self.A, self.S))
        for aa in range(self.A):
            Q[aa] = self.R[aa] + self.discount * self.P[aa].dot(V)
        # Get the policy and value, for now it is being returned but...
        # Which way is better?
        # 1. Return, (policy, value)
        return (Q.argmax(axis=0), Q.max(axis=0))
        # 2. update self.policy and self.V directly
        # self.V = Q.max(axis=0)
        # self.policy = Q.argmax(axis=0)
    
    def _computeP(self, P):
        # Set self.P as a tuple of length A, with each element storing an S×S
        # matrix.
        self.A = len(P)
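        # Duck-type P: a 3-D numpy array exposes .ndim and .shape, while a
        # list or tuple of A matrices does not, hence the AttributeError branch.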
        try:
            if P.ndim == 3:
                self.S = P.shape[1]
            else:
                self.S = P[0].shape[0]
        except AttributeError:
            self.S = P[0].shape[0]
        # convert P to a tuple of numpy arrays
        self.P = tuple(P[aa] for aa in range(self.A))
    
    def _computePR(self, P, R):
        # Compute the reward for the system in one state choosing an action.
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        #    P(SxSxA)  = transition matrix 
        #        P could be an array with 3 dimensions or a cell array (1xA), 
        #        each cell containing a matrix (SxS) possibly sparse
        #    R(SxSxA) or (SxA) = reward matrix
        #        R could be an array with 3 dimensions (SxSxA) or a cell array 
        #        (1xA), each cell containing a sparse matrix (SxS) or a 2D 
        #        array(SxA) possibly sparse  
        # Evaluation
        # ----------
        #    PR(SxA)   = reward matrix
        #
        # We assume that P and R define an MDP, i.e. the assumption is that
        # check(P, R) has already been run and doesn't fail.
        #
        # First compute and store P, S, and A
        self._computeP(P)
        # Set self.R as a tuple of length A, with each element storing a 1×S
        # vector.
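        # Three accepted forms: a 1-D R gives the same reward vector for every
        # action; a 2-D (S, A) R gives one column per action; otherwise R is
        # per-transition and is averaged over next states using P.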
        try:
            if R.ndim == 1:
                r = array(R).reshape(self.S)
                self.R = tuple(r for aa in range(self.A))
            elif R.ndim == 2:
                self.R = tuple(array(R[:, aa]).reshape(self.S)
                                for aa in range(self.A))
            else:
                self.R = tuple(multiply(P[aa], R[aa]).sum(1).reshape(self.S)
                                for aa in range(self.A))
        except AttributeError:
            if len(R) == self.A:
                self.R = tuple(multiply(P[aa], R[aa]).sum(1).reshape(self.S)
                                for aa in range(self.A))
            else:
                r = array(R).reshape(self.S)
                self.R = tuple(r for aa in range(self.A))
    
    def run(self):
        # Raise error because child classes should implement this function.
        raise NotImplementedError("You should create a run() method.")
    
    def setSilent(self):
        """Set the MDP algorithm to silent mode."""
        self.verbose = False
    
    def setVerbose(self):
        """Set the MDP algorithm to verbose mode."""
        self.verbose = True


class FiniteHorizon(MDP):
    
    """An MDP solved using the finite-horizon backwards induction algorithm.
    
    Let S = number of states, A = number of actions
    
    Parameters
    ----------
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or a cell array (1xA),
             each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount factor, in ]0, 1]
    N        = number of periods, greater than 0
    h(S)     = terminal reward, optional (default [0; 0; ... 0] )
    
    Attributes
    ----------
    V(S,N+1)     = optimal value function
                 V(:,n) = optimal value function at stage n
                        with stage in 1, ..., N
                        V(:,N+1) = value function for terminal stage 
    policy(S,N)  = optimal policy
                 policy(:,n) = optimal policy at stage n
                        with stage in 1, ...,N
                        policy(:,N) = policy for stage N
    cpu_time = used CPU time
    
    Notes
    -----
    In verbose mode, displays the current stage and policy transpose.
    
    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.forest()
    >>> fh = mdptoolbox.mdp.FiniteHorizon(P, R, 0.9, 3)
    >>> fh.run()
    >>> fh.V
    array([[ 2.6973,  0.81  ,  0.    ,  0.    ],
           [ 5.9373,  3.24  ,  1.    ,  0.    ],
           [ 9.9373,  7.24  ,  4.    ,  0.    ]])
    >>> fh.policy
    array([[0, 0, 0],
           [0, 0, 1],
           [0, 0, 0]])
    
    """
    
    def __init__(self, transitions, reward, discount, N, h=None):
        # Initialise a finite horizon MDP.
        self.N = int(N)
        assert self.N > 0, 'PyMDPtoolbox: N must be greater than 0.'
        # Initialise the base class
        MDP.__init__(self, transitions, reward, discount, None, None)
        # remove the iteration counter, it is not meaningful for backwards
        # induction
        del self.iter
        # There are value vectors for each time step up to the horizon
        self.V = zeros((self.S, N + 1))
        # There are policy vectors for each time step before the horizon, when
        # we reach the horizon we don't need to make decisions anymore.
        self.policy = empty((self.S, N), dtype=int)
        # Set the reward for the final transition to h, if specified.
        if h is not None:
            self.V[:, N] = h
        # Call the iteration method
        #self.run()
        
    def run(self):
        # Run the finite horizon algorithm.
        self.time = time()
        # loop through each time period
        for n in range(self.N):
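            # The values for stage N - n are already known (for n = 0 they are
            # the terminal values), so one application of the Bellman operator
            # yields the optimal values and actions for stage N - n - 1.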
            W, X = self._bellmanOperator(self.V[:, self.N - n])
            self.V[:, self.N - n - 1] = X
            self.policy[:, self.N - n - 1] = W
            if self.verbose:
                print(("stage: %s ... policy transpose : %s") % (
                    self.N - n, self.policy[:, self.N - n - 1].tolist()))
        # update time spent running
        self.time = time() - self.time
        # After this we could create a tuple of tuples for the values and 
        # policies.
        #V = []
        #p = []
        #for n in xrange(self.N):
        #    V.append()
        #    p.append()
        #V.append()
        #self.V = tuple(V)
        #self.policy = tuple(p)

class LP(MDP):
    
    """A discounted MDP solved using linear programming.

    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or a cell array (1xA),
             each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount rate, in ]0; 1[
    
    Evaluation
    ----------
    V(S)   = optimal values
    policy(S) = optimal policy
    cpu_time = used CPU time
    
    Notes
    -----
    In verbose mode, the progress output of the cvxopt solver is displayed.
    
    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.forest()
    >>> lp = mdptoolbox.mdp.LP(P, R, 0.9)
    >>> lp.run()
    
    """

    def __init__(self, transitions, reward, discount):
        # Initialise a linear programming MDP.
        # import some functions from cvxopt and set them as object methods
        try:
            from cvxopt import matrix, solvers
            self._linprog = solvers.lp
            self._cvxmat = matrix
        except ImportError:
            raise ImportError("The python module cvxopt is required to use "
                              "linear programming functionality.")
        # we also need diagonal matrices, and using a sparse one may be more
        # memory efficient
        from scipy.sparse import eye as speye
        self._speye = speye
        # initialise the MDP. epsilon and max_iter are not needed
        MDP.__init__(self, transitions, reward, discount, None, None)
        # Set the cvxopt solver to be quiet by default, but ...
        # this doesn't do what I want it to do c.f. issue #3
        if not self.verbose:
            solvers.options['show_progress'] = False
        # Call the iteration method
        #self.run()
    
    def run(self):
        # Run the linear programming algorithm.
        self.time = time()
        # The objective is to resolve : min V / V >= PR + discount*P*V
        # The function linprog of the optimisation Toolbox of Mathworks
        # resolves :
        # min f'* x / M * x <= b
        # So the objective could be expressed as :
        # min V / (discount*P-I) * V <= - PR
        # To avoid loop on states, the matrix M is structured following actions
        # M(A*S,S)
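        # (each S-row block aa of M holds discount*P[aa] - I, so the single
        # system M*V <= -PR enforces V >= PR[aa] + discount*P[aa]*V for every
        # action aa at once)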
        f = self._cvxmat(ones((self.S, 1)))
        h = self._cvxmat(array(self.R).reshape(self.S * self.A, 1), tc='d')
        M = zeros((self.A * self.S, self.S))
        for aa in range(self.A):
            pos = (aa + 1) * self.S
            M[(pos - self.S):pos, :] = (
                self.discount * self.P[aa] - self._speye(self.S, self.S))
        M = self._cvxmat(M)
        # Using the glpk option will make this behave more like Octave
        # (Octave uses glpk) and perhaps Matlab. If solver=None (ie using the 
        # default cvxopt solver) then V agrees with the Octave equivalent
        # only to 10e-8 places. This assumes glpk is installed of course.
        self.V = array(self._linprog(f, M, -h, solver='glpk')['x'])
        # apply the Bellman operator
        self.policy, self.V = self._bellmanOperator()
        # update the time spent solving
        self.time = time() - self.time
        # store value and policy as tuples
        self.V = tuple(self.V.tolist())
        self.policy = tuple(self.policy.tolist())

class PolicyIteration(MDP):
    
    """A discounted MDP solved using the policy iteration algorithm.
    
    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or a cell array (1xA),
             each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount rate, in ]0, 1[
    policy0(S) = starting policy, optional 
    max_iter = maximum number of iterations to be done, greater than 0, 
             optional (default 1000)
    eval_type = type of function used to evaluate policy: 
             0 for mdp_eval_policy_matrix, else mdp_eval_policy_iterative
             optional (default 0)
             
    Evaluation
    ----------
    V(S)   = value function 
    policy(S) = optimal policy
    iter     = number of iterations performed
    cpu_time = used CPU time
    
    Notes
    -----
    In verbose mode, at each iteration, displays the number 
    of different actions between policy n-1 and n
    
    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.rand()
    >>> pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.9)
    >>> pi.run()
    
    >>> P, R = mdptoolbox.example.forest()
    >>> pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.9)
    >>> pi.run()
    >>> pi.V
    (26.244000000000018, 29.48400000000002, 33.484000000000016)
    >>> pi.policy
    (0, 0, 0)
    """
    
    def __init__(self, transitions, reward, discount, policy0=None,
                 max_iter=1000, eval_type=0):
        # Initialise a policy iteration MDP.
        #
        # Set up the MDP, but don't need to worry about epsilon values
        MDP.__init__(self, transitions, reward, discount, None, max_iter)
        # Check if the user has supplied an initial policy. If not make one.
        if policy0 is None:
            # Initialise the policy to the one which maximises the expected
            # immediate reward
            null = zeros(self.S)
            self.policy, null = self._bellmanOperator(null)
            del null
        else:
            # Use the policy that the user supplied
            # Make sure it is a numpy array
            policy0 = array(policy0)
            # Make sure the policy is the right size and shape
            assert policy0.shape in ((self.S, ), (self.S, 1), (1, self.S)), \
                "'policy0' must be a vector with length S."
            # reshape the policy to be a vector
            policy0 = policy0.reshape(self.S)
            # The policy can only contain integers between 0 and S-1
            msg = "'policy0' must be a vector of integers between 0 and S-1."
            assert not mod(policy0, 1).any(), msg
            assert (policy0 >= 0).all(), msg
            assert (policy0 < self.S).all(), msg
            self.policy = policy0
        # set the initial values to zero
        self.V = zeros(self.S)
        # Do some setup depending on the evaluation type
        if eval_type in (0, "matrix"):
            from numpy.linalg import solve
            from scipy.sparse import eye
            self._speye = eye
            self._lin_eq = solve
            self.eval_type = "matrix"
        elif eval_type in (1, "iterative"):
            self.eval_type = "iterative"
        else:
            raise ValueError("PyMDPtoolbox: eval_type should be 0 for matrix "
                             "evaluation or 1 for iterative evaluation. "
                             "The strings 'matrix' and 'iterative' can also "
                             "be used.")
        # Call the iteration method
        #self.run()
    
    def _computePpolicyPRpolicy(self):
        # Compute the transition matrix and the reward matrix for a policy.
        #
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        # P(SxSxA)  = transition matrix 
        #     P could be an array with 3 dimensions or a cell array (1xA),
        #     each cell containing a matrix (SxS) possibly sparse
        # R(SxSxA) or (SxA) = reward matrix
        #     R could be an array with 3 dimensions (SxSxA) or 
        #     a cell array (1xA), each cell containing a sparse matrix (SxS) or
        #     a 2D array(SxA) possibly sparse  
        # policy(S) = a policy
        #
        # Evaluation
        # ----------
        # Ppolicy(SxS)  = transition matrix for policy
        # PRpolicy(S)   = reward matrix for policy
        #
        Ppolicy = empty((self.S, self.S))
        Rpolicy = zeros(self.S)
        for aa in range(self.A): # avoid looping over S
            # the rows that use action a.
            ind = (self.policy == aa).nonzero()[0]
            # if no rows use action a, then no need to assign this
            if ind.size > 0:
                try:
                    Ppolicy[ind, :] = self.P[aa][ind, :]
                except ValueError:
                    Ppolicy[ind, :] = self.P[aa][ind, :].todense()
                #PR = self._computePR() # an apparently unneeded line, and
                # perhaps harmful in this implementation c.f.
                # mdp_computePpolicyPRpolicy.m
                Rpolicy[ind] = self.R[aa][ind]
        # self.R cannot be sparse with the code in its current condition, but
        # it should be possible in the future. Also, if R is so big that it's
        # a good idea to use a sparse matrix for it, then converting PRpolicy
        # from a dense to sparse matrix doesn't seem very memory efficient
        if type(self.R) is sparse:
            Rpolicy = sparse(Rpolicy)
        #self.Ppolicy = Ppolicy
        #self.Rpolicy = Rpolicy
        return (Ppolicy, Rpolicy)
    
    def _evalPolicyIterative(self, V0=0, epsilon=0.0001, max_iter=10000):
        # Evaluate a policy using iteration.
        #
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        # P(SxSxA)  = transition matrix 
        #    P could be an array with 3 dimensions or 
        #    a cell array (1xA), each cell containing a matrix possibly sparse
        # R(SxSxA) or (SxA) = reward matrix
        #    R could be an array with 3 dimensions (SxSxA) or 
        #    a cell array (1xA), each cell containing a sparse matrix (SxS) or
        #    a 2D array(SxA) possibly sparse  
        # discount  = discount rate in ]0; 1[
        # policy(S) = a policy
        # V0(S)     = starting value function, optional (default : zeros(S,1))
        # epsilon   = epsilon-optimal policy search, greater than 0,
        #    optional (default : 0.0001)
        # max_iter  = maximum number of iterations to be done, greater than 0, 
        #    optional (default : 10000)
        #    
        # Evaluation
        # ----------
        # Vpolicy(S) = value function, associated to a specific policy
        #
        # Notes
        # -----
        # In verbose mode, at each iteration, displays the condition which
        # stopped iterations: epsilon-optimum value function found or maximum
        # number of iterations reached.
        #
        try:
            assert V0.shape in ((self.S, ), (self.S, 1), (1, self.S)), \
                "'V0' must be a vector of length S."
            policy_V = array(V0).reshape(self.S)
        except AttributeError:
            if V0 == 0:
                # the default scalar of zero means "start from all zeros"
                policy_V = zeros(self.S)
            elif len(V0) == self.S:
                policy_V = array(V0).reshape(self.S)
            else:
                policy_V = zeros(self.S)
        
        policy_P, policy_R = self._computePpolicyPRpolicy()
        
        if self.verbose:
            print('  Iteration    V_variation')
        
        itr = 0
        done = False
        while not done:
            itr += 1
            
            Vprev = policy_V
            policy_V = policy_R + self.discount * policy_P.dot(Vprev)
            
            variation = absolute(policy_V - Vprev).max()
            if self.verbose:
                print(('      %s         %s') % (itr, variation))
            
            # ensure |Vn - Vpolicy| < epsilon
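            # (contraction bound: once the sup-norm change between sweeps is
            # below (1 - discount) / discount * epsilon, the current iterate
            # is within epsilon of the true value of the policy)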
            if variation < ((1 - self.discount) / self.discount) * epsilon:
                done = True
                if self.verbose:
                    print("PyMDPtoolbox: iterations stopped, epsilon-optimal "
                          "value function.")
            elif itr == max_iter:
                done = True
                if self.verbose:
                    print("PyMDPtoolbox: iterations stopped by maximum number "
                          "of iteration condition.")
        
        self.V = policy_V
    
    def _evalPolicyMatrix(self):
        # Evaluate the value function of the policy using linear equations.
        #
        # Arguments 
        # ---------
        # Let S = number of states, A = number of actions
        # P(SxSxA) = transition matrix 
        #      P could be an array with 3 dimensions or a cell array (1xA),
        #      each cell containing a matrix (SxS) possibly sparse
        # R(SxSxA) or (SxA) = reward matrix
        #      R could be an array with 3 dimensions (SxSxA) or 
        #      a cell array (1xA), each cell containing a sparse matrix (SxS) or
        #      a 2D array(SxA) possibly sparse  
        # discount = discount rate in ]0; 1[
        # policy(S) = a policy
        #
        # Evaluation
        # ----------
        # Vpolicy(S) = value function of the policy
        #
        Ppolicy, Rpolicy = self._computePpolicyPRpolicy()
        # V = PR + gPV  => (I-gP)V = PR  => V = inv(I-gP)* PR
        self.V = self._lin_eq(
            (self._speye(self.S, self.S) - self.discount * Ppolicy), Rpolicy)
    
    def run(self):
        # Run the policy iteration algorithm.
        # If verbose then print a header
        if self.verbose:
            print('  Iteration  Number_of_different_actions')
        # Set up the while stopping condition and the current time
        done = False
        self.time = time()
        # loop until a stopping condition is reached
        while not done:
            self.iter += 1
            # these _evalPolicy* functions will update the class's value
            # attribute
            if self.eval_type == "matrix":
                self._evalPolicyMatrix()
            elif self.eval_type == "iterative":
                self._evalPolicyIterative()
            # This should update the class's policy attribute but leave the
            # value alone
            policy_next, null = self._bellmanOperator()
            del null
            # calculate in how many places the old policy disagrees with
            # the new policy
            n_different = (policy_next != self.policy).sum()
            # if verbose then continue printing a table
            if self.verbose:
                print(('       %s                 %s') % (self.iter,
                                                         n_different))
            # Once the policy is unchanging or the maximum number of
            # iterations has been reached, then stop
            if n_different == 0:
                done = True
                if self.verbose:
                    print("PyMDPtoolbox: iterations stopped, unchanging "
                          "policy found.")
            elif (self.iter == self.max_iter):
                done = True
                if self.verbose:
                    print("PyMDPtoolbox: iterations stopped by maximum number "
                          "of iteration condition.")
            else:
                self.policy = policy_next
        # update the time to return the computation time
        self.time = time() - self.time
        # store value and policy as tuples
        self.V = tuple(self.V.tolist())
        self.policy = tuple(self.policy.tolist())

class PolicyIterationModified(PolicyIteration):
    
    """A discounted MDP solved using a modified policy iteration algorithm.
    
    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or a cell array (1xA),
             each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount rate, in ]0, 1[
    epsilon  = epsilon-optimal policy search, greater than 0,
             optional (default 0.01)
    max_iter = maximum number of iterations to be done, greater than 0,
             optional (default 10)
    
    Data Attributes
    ---------------
    V(S)   = value function 
    policy(S) = optimal policy
    iter     = number of iterations performed
    cpu_time = used CPU time
    
    Notes
    -----
    In verbose mode, at each iteration, displays the variation of V.
    
    Examples
    --------
    >>> import mdptoolbox, mdptoolbox.example
    >>> P, R = mdptoolbox.example.forest()
    >>> pim = mdptoolbox.mdp.PolicyIterationModified(P, R, 0.9)
    >>> pim.run()
    >>> pim.policy
    (0, 0, 0)
    >>> pim.V
    (21.81408652334702, 25.054086523347017, 29.054086523347017)
    
    """
    
    def __init__(self, transitions, reward, discount, epsilon=0.01,
                 max_iter=10):
        # Initialise a (modified) policy iteration MDP.
        
        # Maybe it's better not to subclass from PolicyIteration, because the
        # initialisation of the two are quite different. e.g. there is policy0
        # being calculated here which doesn't need to be. The only thing that
        # is needed from the PolicyIteration class is the _evalPolicyIterative
        # function. Perhaps there is a better way to do it?
        PolicyIteration.__init__(self, transitions, reward, discount, None,
                                 max_iter, 1)
        
        # PolicyIteration doesn't pass epsilon to MDP.__init__() so we will
        # check it here
        self.epsilon = float(epsilon)
        assert epsilon > 0, "'epsilon' must be greater than 0."
        
        # computation of threshold of variation for V for an epsilon-optimal
        # policy
        if self.discount != 1:
            self.thresh = self.epsilon * (1 - self.discount) / self.discount
        else:
            self.thresh = self.epsilon
        
        if self.discount == 1:
            self.V = zeros((self.S, 1))
        else:
            Rmin = min(R.min() for R in self.R)
            self.V = 1 / (1 - self.discount) * Rmin * ones((self.S,))
        
        # Call the iteration method
        #self.run()
    
    def run(self):
        # Run the modified policy iteration algorithm.
        
        if self.verbose:
            print('\tIteration\tV-variation')
        
        self.time = time()
        
        done = False
        while not done:
            self.iter += 1
            
            self.policy, Vnext = self._bellmanOperator()
            #[Ppolicy, PRpolicy] = mdp_computePpolicyPRpolicy(P, PR, policy);
            
            variation = getSpan(Vnext - self.V)
            if self.verbose:
                print(("\t%s\t%s" % (self.iter, variation)))
            
            self.V = Vnext
            if variation < self.thresh:
                done = True
            else:
                is_verbose = False
                if self.verbose:
                    self.setSilent()
                    is_verbose = True
                
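                # the "modified" part: rather than solving the linear system
                # for the new policy exactly, run a few cheap iterative
                # evaluation sweeps starting from the current V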
                self._evalPolicyIterative(self.V, self.epsilon, self.max_iter)
                
                if is_verbose:
                    self.setVerbose()
        
        self.time = time() - self.time
        
        # store value and policy as tuples
        self.V = tuple(self.V.tolist())
        self.policy = tuple(self.policy.tolist())

class QLearning(MDP):
    
    """A discounted MDP solved using the Q learning algorithm.
    
    Let S = number of states, A = number of actions
    
    Parameters
    ----------
    P : transition matrix (SxSxA)
        P could be an array with 3 dimensions or a cell array (1xA), each
        cell containing a sparse matrix (SxS)
    R : reward matrix(SxSxA) or (SxA)
        R could be an array with 3 dimensions (SxSxA) or a cell array
        (1xA), each cell containing a sparse matrix (SxS) or a 2D
        array(SxA) possibly sparse
    discount : discount rate
        in ]0; 1[
    n_iter : number of iterations to execute (optional).
        Default value = 10000; it must be an integer greater than or equal
        to 10000.
    
    Results
    -------
    Q : learned Q matrix (SxA) 
    
    V : learned value function (S).
    
    policy : learned optimal policy (S).
    
    mean_discrepancy : vector of V discrepancy mean over 100 iterations
        Thus the length of this vector for the default value of N is 100 
        (N/100).

    Examples
    --------
    >>> # These examples are reproducible only if random seed is set to 0 in
    >>> # both the random and numpy.random modules.
    >>> import numpy as np
    >>> import mdptoolbox, mdptoolbox.example
    >>> np.random.seed(0)
    >>> P, R = mdptoolbox.example.forest()
    >>> ql = mdptoolbox.mdp.QLearning(P, R, 0.96)
    >>> ql.run()
    >>> ql.Q
    array([[ 68.38037354,  43.24888454],
           [ 72.37777922,  42.75549145],
           [ 77.02892702,  64.68712932]])
    >>> ql.V
    (68.38037354422798, 72.37777921607258, 77.02892701616531)
    >>> ql.policy
    (0, 0, 0)
    
    >>> import mdptoolbox
    >>> import numpy as np
    >>> P = np.array([[[0.5, 0.5],[0.8, 0.2]],[[0, 1],[0.1, 0.9]]])
    >>> R = np.array([[5, 10], [-1, 2]])
    >>> np.random.seed(0)
    >>> ql = mdptoolbox.mdp.QLearning(P, R, 0.9)
    >>> ql.run()
    >>> ql.Q
    array([[ 39.933691  ,  43.17543338],
           [ 36.94394224,  35.42568056]])
    >>> ql.V
    (43.17543338090149, 36.943942243204454)
    >>> ql.policy
    (1, 0)
    
    """
    
    def __init__(self, transitions, reward, discount, n_iter=10000):
        # Initialise a Q-learning MDP.
        
        # The following check won't be done in MDP()'s initialisation, so let's
        # do it here
        self.max_iter = int(n_iter)
        assert self.max_iter >= 10000, "PyMDPtoolbox: n_iter should be " \
                                        "greater than or equal to 10000."
        
        # We don't want to send this to MDP because _computePR should not be
        # run on it, so check that it defines an MDP
        check(transitions, reward)
        
        # Store P, S, and A
        self._computeP(transitions)
        
        self.R = reward
        
        self.discount = discount
        
        # Initialisations
        self.Q = zeros((self.S, self.A))
        self.mean_discrepancy = []
        
        # Call the iteration method
        #self.run()
        
    def run(self):
        # Run the Q-learning algorithm.
        discrepancy = []
        
        self.time = time()
        
        # initial state choice
        s = randint(0, self.S)
        
        for n in range(1, self.max_iter + 1):
            
            # Reinitialisation of trajectories every 100 transitions
            if ((n % 100) == 0):
                s = randint(0, self.S)
            
            # Action choice : greedy with increasing probability
            # probability 1-(1/log(n+2)) can be changed
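            # i.e. explore with probability 1/log(n + 2), which slowly decays
            # towards purely greedy action selection as n grows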
            pn = random()
            if (pn < (1 - (1 / log(n + 2)))):
                # optimal_action = self.Q[s, :].max()
                a = self.Q[s, :].argmax()
            else:
                a = randint(0, self.A)
            
            # Simulating next state s_new and reward associated to <s,s_new,a>
            p_s_new = random()
            p = 0
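            # walk along the cumulative distribution of P[a][s, :] until it
            # exceeds the uniform draw p_s_new (inverse-transform sampling)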
            s_new = -1
            while ((p < p_s_new) and (s_new < (self.S - 1))):
                s_new = s_new + 1
                p = p + self.P[a][s, s_new]