# -*- coding: utf-8 -*-
"""
Copyright (c) 2011, 2012, 2013 Steven Cordwell
Copyright (c) 2009, Iadine Chadès
Copyright (c) 2009, Marie-Josée Cros
Copyright (c) 2009, Frédérick Garcia
Copyright (c) 2009, Régis Sabbadin

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

  * Redistributions of source code must retain the above copyright notice, this
    list of conditions and the following disclaimer.
  * Redistributions in binary form must reproduce the above copyright notice,
    this list of conditions and the following disclaimer in the documentation
    and/or other materials provided with the distribution.
  * Neither the name of the <ORGANIZATION> nor the names of its contributors
    may be used to endorse or promote products derived from this software
    without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""

from numpy import absolute, array, diag, matrix, mean, ndarray, ones, zeros
from numpy.random import rand
from numpy.random import randint as randi
from math import ceil, log, sqrt
from random import randint, random
from scipy.sparse import csr_matrix as sparse
from time import time

mdperr = {
"mat_nonneg" :
    "PyMDPtoolbox: Probabilities must be non-negative.",
"mat_square" :
    "PyMDPtoolbox: The matrix must be square.",
"mat_stoch" :
    "PyMDPtoolbox: Rows of the matrix must sum to one (1).",
"mask_numpy" :
    "PyMDPtoolbox: mask must be a numpy array or matrix; i.e. type(mask) is "
    "ndarray or type(mask) is matrix.",
"mask_SbyS" :
    "PyMDPtoolbox: The mask must have shape SxS; i.e. mask.shape = (S, S).",
"obj_shape" :
    "PyMDPtoolbox: Object arrays for transition probabilities and rewards "
    "must have only 1 dimension: the number of actions A. Each element of "
    "the object array contains an SxS ndarray or matrix.",
"obj_square" :
    "PyMDPtoolbox: Each element of an object array for transition "
    "probabilities and rewards must contain an SxS ndarray or matrix; i.e. "
    "P[a].shape = (S, S) or R[a].shape = (S, S).",
"P_type" :
    "PyMDPtoolbox: The transition probabilities must be in a numpy array; "
    "i.e. type(P) is ndarray.",
"P_shape" :
    "PyMDPtoolbox: The transition probability array must have the shape "
    "(A, S, S) with S : number of states greater than 0 and A : number of "
    "actions greater than 0. i.e. P.shape = (A, S, S)",
"PR_incompat" :
    "PyMDPtoolbox: Incompatibility between P and R dimensions.",
"prob_in01" :
    "PyMDPtoolbox: Probability p must be in [0; 1].",
"R_type" :
    "PyMDPtoolbox: The rewards must be in a numpy array; i.e. type(R) is "
    "ndarray, or numpy matrix; i.e. type(R) is matrix.",
"R_shape" :
    "PyMDPtoolbox: The reward matrix R must be an array of shape (A, S, S) or "
    "(S, A) with S : number of states greater than 0 and A : number of actions "
    "greater than 0. i.e. R.shape = (S, A) or (A, S, S).",
"R_gt_0" :
    "PyMDPtoolbox: The rewards must be greater than 0.",
"S_gt_1" :
    "PyMDPtoolbox: Number of states S must be greater than 1.",
"SA_gt_1" :
    "PyMDPtoolbox: The number of states S and the number of actions A must be "
    "greater than 1.",
"discount_rng" :
    "PyMDPtoolbox: Discount rate must be in ]0; 1]",
"maxi_min" :
    "PyMDPtoolbox: The maximum number of iterations must be greater than 0"
}

def exampleForest(S=3, r1=4, r2=2, p=0.1):
    """
    Generates a Markov Decision Process example based on a simple forest
    management scenario.
    
    See the related documentation for more detail.
    
    Parameters
    ----------
    S : number of states (> 0), optional (default 3)
    r1 : reward when forest is in the oldest state and action Wait is
        performed, optional (default 4)
    r2 : reward when forest is in the oldest state and action Cut is
        performed, optional (default 2)
    p : probability of wild fire occurrence, in ]0, 1[, optional (default 0.1)
    
    Evaluation
    ----------
    P : transition probability matrix (A, S, S)
    R : reward matrix (S, A)
    
    Examples
    --------
    >>> import mdp
    >>> P, R = mdp.exampleForest()
    >>> P
    array([[[ 0.1,  0.9,  0. ],
            [ 0.1,  0. ,  0.9],
            [ 0.1,  0. ,  0.9]],

           [[ 1. ,  0. ,  0. ],
            [ 1. ,  0. ,  0. ],
            [ 1. ,  0. ,  0. ]]])
    >>> R
    array([[ 0.,  0.],
           [ 0.,  1.],
           [ 4.,  2.]])
    
    """
    if (S <= 1):
        raise ValueError(mdperr["S_gt_1"])
    if (r1 <= 0) or (r2 <= 0):
        raise ValueError(mdperr["R_gt_0"])
    if (p < 0 or p > 1):
        raise ValueError(mdperr["prob_in01"])
    
    # Definition of Transition matrix P(:,:,1) associated to action Wait (action 1) and
    # P(:,:,2) associated to action Cut (action 2)
    #             | p 1-p 0.......0  |                  | 1 0..........0 |
    #             | .  0 1-p 0....0  |                  | . .          . |
    #  P(:,:,1) = | .  .  0  .       |  and P(:,:,2) =  | . .          . |
    #             | .  .        .    |                  | . .          . |
    #             | .  .         1-p |                  | . .          . |
    #             | p  0  0....0 1-p |                  | 1 0..........0 |
    P = zeros((2, S, S))
    P[0, :, :] = (1 - p) * diag(ones(S - 1), 1)
    P[0, :, 0] = p
    P[0, S - 1, S - 1] = (1 - p)
    P[1, :, :] = zeros((S, S))
    P[1, :, 0] = 1
    
    # Definition of Reward matrix R1 associated to action Wait and 
    # R2 associated to action Cut
    #           | 0  |                   | 0  |
    #           | .  |                   | 1  |
    #  R(:,1) = | .  |  and     R(:,2) = | .  |	
    #           | .  |                   | .  |
    #           | 0  |                   | 1  |                   
    #           | r1 |                   | r2 |
    R = zeros((S, 2))
    R[S - 1, 0] = r1
    R[:, 1] = ones(S)
    R[0, 1] = 0
    R[S - 1, 1] = r2
    
    return (P, R)
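
# Illustrative usage sketch (not part of the original source): the pair
# returned by exampleForest() can be passed straight to the solver classes
# defined below, and each P[a] is row-stochastic, which is easy to verify.
#
#     P, R = exampleForest(S=4, r1=5, r2=2, p=0.2)
#     assert (absolute(P.sum(axis=2) - 1) < 10**(-12)).all()
#     vi = ValueIteration(P, R, 0.9)
#     vi.iterate()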

def exampleRand(S, A, is_sparse=False, mask=None):
    """Generates a random Markov Decision Process.
    
    Parameters
    ----------
    S : number of states (> 0)
    A : number of actions (> 0)
    is_sparse : false to have matrices in plain format, true to have sparse
        matrices, optional (default false).
    mask : matrix with 0 and 1 (0 indicates a place for a zero
           probability), optional (SxS) (default, random)
    
    Returns
    -------
    P : transition probability matrix (A, S, S)
    R : reward matrix (A, S, S)

    Examples
    --------
    >>> import mdp
    >>> P, R = mdp.exampleRand(5, 3)

    """
    if (S < 1 or A < 1):
        raise ValueError(mdperr["SA_gt_1"])
    
    try:
        if (mask is not None) and ((mask.shape[0] != S) or (mask.shape[1] != S)):
            raise ValueError(mdperr["mask_SbyS"])
    except AttributeError:
        raise TypeError(mdperr["mask_numpy"])
    
    if mask is None:
        mask = rand(A, S, S)
        for a in range(A):
            r = random()
            mask[a][mask[a] < r] = 0
            mask[a][mask[a] >= r] = 1
    
    if is_sparse:
        # definition of transition matrix : square stochastic matrix
        P = zeros((A, ), dtype=object)
        # definition of reward matrix (values between -1 and +1)
        R = zeros((A, ), dtype=object)
        for a in range(A):
            PP = mask[a] * rand(S, S)
            for s in range(S):
                if (mask[a, s, :].sum() == 0):
                    PP[s, randint(0, S - 1)] = 1
                PP[s, :] = PP[s, :] / PP[s, :].sum()
            P[a] = sparse(PP)
            R[a] = sparse(mask[a] * (2 * rand(S, S) - ones((S, S))))
    else:
        # definition of transition matrix : square stochastic matrix
        P = zeros((A, S, S))
        # definition of reward matrix (values between -1 and +1)
        R = zeros((A, S, S))
        for a in range(A):
            P[a, :, :] = mask[a] * rand(S, S)
            for s in range(S):
                if (mask[a, s, :].sum() == 0):
                    P[a, s, randint(0, S - 1)] = 1
                P[a, s, :] = P[a, s, :] / P[a, s, :].sum()
            R[a, :, :] = mask[a] * (2 * rand(S, S) - ones((S, S), dtype=int))
    
    return (P, R)

class MDP(object):
    """The Markov Decision Problem Toolbox."""
    
    def __init__(self, transitions, reward, discount, max_iter):
        """Initialise the MDP from transitions, rewards, a discount rate and
        a maximum iteration count."""
        
        if (type(discount) is int) or (type(discount) is float):
            if (discount <= 0) or (discount > 1):
                raise ValueError(mdperr["discount_rng"])
            else:
                self.discount = discount
        elif discount is not None:
            raise ValueError("PyMDPtoolbox: the discount must be a positive "
                             "real number less than or equal to one.")
        
        if (type(max_iter) is int) or (type(max_iter) is float):
            if (max_iter <= 0):
                raise ValueError(mdperr["maxi_min"])
            else:
                self.max_iter = max_iter
        elif max_iter is not None:
            raise ValueError("PyMDPtoolbox: max_iter must be a positive real "
                             "number greater than zero.")
        
        self.check(transitions, reward)
        
        self.computePR(transitions, reward)
        
        # the verbosity is by default turned off
        self.verbose = False
        # Initially the time taken to perform the computations is set to None
        self.time = None
        # set the initial iteration count to zero
        self.iter = 0
        
        self.value = None
        self.policy = None
    
    def bellmanOperator(self):
        """
        Applies the Bellman operator to the value function.
        
        Updates the value function and the Vprev-improving policy.
        
        Returns
        -------
        (policy, value) : tuple of the new policy and its value
        """
        Q = matrix(zeros((self.S, self.A)))
        for aa in range(self.A):
            Q[:, aa] = self.R[:, aa] + (self.discount * self.P[aa] * self.value)
        
        # Which way is better? If the first way is chosen, then the classes
        # that call this function must be changed.
        # 1. Return (policy, value)
        # return (Q.argmax(axis=1), Q.max(axis=1))
        # 2. change self.policy and self.value directly
        self.value = Q.max(axis=1)
        self.policy = Q.argmax(axis=1)
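
    # Illustrative note (not part of the original source): bellmanOperator()
    # computes, for the current estimate V,
    #     Q(s, a) = R(s, a) + discount * sum_{s'} P(s'|s, a) * V(s')
    # and keeps the greedy value max_a Q(s, a) and policy argmax_a Q(s, a).
    # A minimal standalone sketch of the same update for dense arrays, assuming
    # P has shape (A, S, S), R has shape (S, A) and V has shape (S,):
    #
    #     Q = array([R[:, a] + discount * P[a].dot(V) for a in range(A)]).T
    #     policy, value = Q.argmax(axis=1), Q.max(axis=1)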
    
    def check(self, P, R):
        """Checks if the matrices P and R define a Markov Decision Process.
        
        Let S = number of states, A = number of actions.
        The transition matrix P must have the shape (A, S, S) and P[a, :, :]
        must be stochastic.
        The reward matrix R must have the shape (A, S, S) or (S, A).
        Raises an error if P and R do not define a MDP.
        
        Parameters
        ----------
        P : transition matrix (A, S, S)
            P could be an array with 3 dimensions or an object array (A, ),
            each cell containing a matrix (S, S) possibly sparse
        R : reward matrix (A, S, S) or (S, A)
            R could be an array with 3 dimensions (A, S, S) or an object array
            (A, ), each cell containing a sparse matrix (S, S) or a 2D
            array (S, A) possibly sparse
        """
        
        # Check of P
        # transitions must be a numpy array: either an AxSxS ndarray (with any
        # dtype other than "object"), or a 1xA ndarray with an "object" dtype
        # where each element contains an SxS array. An AxSxS array will be
        # converted to an object array. A numpy object array is similar to a
        # MATLAB cell array.
        if (not type(P) is ndarray):
            raise TypeError(mdperr["P_type"])
        
        if (not type(R) is ndarray):
            raise TypeError(mdperr["R_type"])
        
        # NumPy has an array type of 'object', which is roughly equivalent to
        # the MATLAB cell array. These are most useful for storing sparse
        # matrices as these can only have two dimensions whereas we want to be
        # able to store a transition matrix for each action. If the dtype of
        # the transition probability array is object then we store this as
        # P_is_object = True.
        # If it is an object array, then it should only have one dimension,
        # otherwise fail with a message explaining why.
        # If it is a normal array then the number of dimensions must be exactly
        # three, otherwise fail with a message explaining why.
        if (P.dtype == object):
            if (P.ndim > 1):
                raise ValueError(mdperr["obj_shape"])
            else:
                P_is_object = True
        else:
            if (P.ndim != 3):
                raise ValueError(mdperr["P_shape"])
            else:
                P_is_object = False
        
        # As above but for the reward array. A difference is that the reward
        # array can have either two or three dimensions.
        if (R.dtype == object):
            if (R.ndim > 1):
                raise ValueError(mdperr["obj_shape"])
            else:
                R_is_object = True
        else:
            if (not R.ndim in (2, 3)):
                raise ValueError(mdperr["R_shape"])
            else:
                R_is_object = False
        
        # We want to make sure that the transition probability array and the
        # reward array are in agreement. This means that both should show that
        # there are the same number of actions and the same number of states.
        # Furthermore the transition probability matrices must be SxS in
        # shape, so we check for that also.
        if P_is_object:
            # If the user has put their transition matrices into a numpy array
            # with dtype of 'object', then it is possible that they have made a
            # mistake and not all of the matrices are of the same shape. So,
            # here we record the number of actions and states that the first
            # matrix in element zero of the object array says it has. After
            # that we check that every other matrix also reports the same
            # number of actions and states, otherwise fail with an error.
            # aP: the number of actions in the transition array. This
            # corresponds to the number of elements in the object array.
            aP = P.shape[0]
            # sP0: the number of states as reported by the number of rows of
            # the transition matrix
            # sP1: the number of states as reported by the number of columns of
            # the transition matrix
            sP0, sP1 = P[0].shape
            # Now we check to see that every element of the object array holds
            # a matrix of the same shape, otherwise fail.
            for aa in range(1, aP):
                # sP0aa and sP1aa represent the number of states in each
                # subsequent element of the object array. If it doesn't match
                # what was found in the first element, then we need to fail
                # telling the user what needs to be fixed.
                sP0aa, sP1aa = P[aa].shape
                if ((sP0aa != sP0) or (sP1aa != sP1)):
                    raise ValueError(mdperr["obj_square"])
        else:
            # if we are using a normal array for this, then the first
            # dimension should be the number of actions, and the second and
            # third should be the number of states
            aP, sP0, sP1 = P.shape
        
        # the first dimension of the transition matrix must report the same
        # number of states as the second dimension. If not then we are not
        # dealing with a square matrix and it is not a valid transition
        # probability. Also, if the number of actions is less than one, or the
        # number of states is less than one, then it also is not a valid
        # transition probability.
        if ((sP0 < 1) or (aP < 1) or (sP0 != sP1)):
            raise ValueError(mdperr["P_shape"])
        
        # now we check that each transition matrix is square-stochastic. For
        # object arrays this is the matrix held in each element, but for
        # normal arrays this is a matrix formed by taking a slice of the array
        for aa in range(aP):
            if P_is_object:
                self.checkSquareStochastic(P[aa])
            else:
                self.checkSquareStochastic(P[aa, :, :])
        
        if R_is_object:
            # if the reward array has an object dtype, then we check that
            # each element contains a matrix of the same shape as we did
            # above with the transition array.
            aR = R.shape[0]
            sR0, sR1 = R[0].shape
            for aa in range(1, aR):
                sR0aa, sR1aa = R[aa].shape
                if ((sR0aa != sR0) or (sR1aa != sR1)):
                    raise ValueError(mdperr["obj_square"])
        elif (R.ndim == 3):
            # This indicates that the reward matrices are constructed per
            # transition, so that the first dimension is the actions and
            # the second two dimensions are the states.
            aR, sR0, sR1 = R.shape
        else:
            # then the reward matrix is per state, so the first dimension is
            # the states and the second dimension is the actions.
            sR0, aR = R.shape
            # this is added just so that the next check doesn't error out
            # saying that sR1 doesn't exist
            sR1 = sR0
        
        # the number of actions must be more than zero, the number of states
        # must also be more than 0, and the states must agree
        if ((sR0 < 1) or (aR < 1) or (sR0 != sR1)):
            raise ValueError(mdperr["R_shape"])
        
        # now we check to see that what the transition array is reporting and
        # what the reward array is reporting agree as to the number of actions
        # and states. If not then fail explaining the situation
        if (sP0 != sR0) or (aP != aR):
            raise ValueError(mdperr["PR_incompat"])
        
        # We are at the end of the checks, so if no exceptions have been raised
        # then that means there are (hopefully) no errors and we return None
        return None
    
    def checkSquareStochastic(self, Z):
        """Check if Z is a square stochastic matrix
        
        Parameters
        ----------
        Z : an SxS matrix. It could be a numpy ndarray SxS, or a scipy.sparse
            csr_matrix
        
        Evaluation
        ----------
        Returns None if no error has been detected
        """
        s1, s2 = Z.shape
        if (s1 != s2):
            raise ValueError(mdperr["mat_square"])
        elif (absolute(Z.sum(axis=1) - ones(s2))).max() > 10**(-12):
            raise ValueError(mdperr["mat_stoch"])
        elif ((type(Z) is ndarray) or (type(Z) is matrix)) and (Z < 0).any():
            raise ValueError(mdperr["mat_nonneg"])
        elif (type(Z) is sparse) and (Z.data < 0).any():
            raise ValueError(mdperr["mat_nonneg"])
        else:
            return None
    
    def computePpolicyPRpolicy(self):
        """Computes the transition matrix and the reward matrix for a policy.
        """
        raise NotImplementedError("This method has not been implemented yet.")
    
    def computePR(self, P, R):
        """Computes the reward of the system for each state when choosing an
        action.
        
        Arguments
        ---------
        Let S = number of states, A = number of actions
            P(SxSxA)  = transition matrix
                P could be an array with 3 dimensions or a cell array (1xA),
                each cell containing a matrix (SxS) possibly sparse
            R(SxSxA) or (SxA) = reward matrix
                R could be an array with 3 dimensions (SxSxA) or a cell array
                (1xA), each cell containing a sparse matrix (SxS) or a 2D
                array (SxA) possibly sparse
        
        Evaluation
        ----------
            PR(SxA)   = reward matrix
        """
        # make P be an object array with (S, S) shaped array elements
        if (P.dtype == object):
            self.P = P
            self.A = self.P.shape[0]
            self.S = self.P[0].shape[0]
        else: # convert to an object array
            self.A = P.shape[0]
            self.S = P.shape[1]
            self.P = zeros(self.A, dtype=object)
            for aa in range(self.A):
                self.P[aa] = P[aa, :, :]
        
        # make R have the shape (S, A)
        if ((R.ndim == 2) and (R.dtype != object)):
            # R already has shape (S, A)
            self.R = R
        else:
            # R has shape (A, S, S) or object shape (A,) with each element
            # shaped (S, S): collapse it to (S, A) by taking the expected
            # immediate reward of each (state, action) pair,
            # PR[s, a] = sum over s' of P[a, s, s'] * R[a, s, s']
            self.R = zeros((self.S, self.A))
            if (R.dtype == object):
                for aa in range(self.A):
                    self.R[:, aa] = (P[aa] * R[aa]).sum(axis=1)
            else:
                for aa in range(self.A):
                    self.R[:, aa] = (P[aa] * R[aa, :, :]).sum(axis=1)
        
        # convert the arrays to numpy matrices
        for aa in range(self.A):
            if (type(self.P[aa]) is ndarray):
                self.P[aa] = matrix(self.P[aa])
        if (type(self.R) is ndarray):
            self.R = matrix(self.R)
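
    # Illustrative note (not part of the original source): for an (A, S, S)
    # reward array, computePR() collapses R to shape (S, A) by taking the
    # expected immediate reward of each (state, action) pair. A tiny worked
    # example with A = 1 and S = 2, using arrays shaped as documented above:
    #
    #     P = array([[[0.5, 0.5], [0.8, 0.2]]])   # transition probabilities
    #     R = array([[[1.0, 3.0], [0.0, 2.0]]])   # reward per transition
    #     PR = (P[0] * R[0]).sum(axis=1)          # -> array([ 2. ,  0.4])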

    def iterate(self):
        """This is a placeholder method. Child classes should define their own
        iterate() method.
        """
        raise NotImplementedError("You should create an iterate() method.")
    
    def getSpan(self, W):
        """Returns the span of W
        
        sp(W) = max W(s) - min W(s)
        """
        return (W.max() - W.min())
    
    def setSilent(self):
        """Sets the MDP resolution functions to run in silent mode."""
        self.verbose = False
    
    def setVerbose(self):
        """Sets the MDP resolution functions to run in verbose mode."""
        self.verbose = True

class FiniteHorizon(MDP):
    """Resolution of finite-horizon MDP with backwards induction.
    
    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or 
             a cell array (1xA), each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount factor, in ]0, 1]
    N        = number of periods, greater than 0
    h(S)     = terminal reward, optional (default [0; 0; ... 0] )
    
    Evaluation
    ----------
    V(S,N+1)     = optimal value function
                 V(:,n) = optimal value function at stage n
                        with stage in 1, ..., N
                        V(:,N+1) = value function for terminal stage 
    policy(S,N)  = optimal policy
                 policy(:,n) = optimal policy at stage n
                        with stage in 1, ..., N
                        policy(:,N) = policy for stage N
    cpu_time = used CPU time
    
    Notes
    -----
    In verbose mode, displays the current stage and policy transpose.
    
    """
    
    def __init__(self, transitions, reward, discount, N, h=None):
        """"""
        if N < 1:
            raise ValueError('MDP Toolbox ERROR: N must be greater than 0')
        else:
            self.N = N
        
        MDP.__init__(self, transitions, reward, discount, None)
        
        # columns 0..N-1 hold the stage values; column N holds the terminal
        # values, which default to zero
        self.value = zeros((self.S, N + 1))
        self.policy = zeros((self.S, N), dtype=int)
        
        if h is not None:
            self.value[:, N] = h
        
    def iterate(self):
        """"""
        self.time = time()
        
        for n in range(self.N):
            W, X = self.bellmanOperator(self.P, self.R, self.discount, self.value[:, self.N - n])
            self.value[:, self.N - n - 1] = W
            self.policy[:, self.N - n - 1] = X
            if self.verbose:
                print("stage: %s ... policy transpose : %s" % (self.N - n, self.policy[:, self.N - n - 1].T))
        
        self.time = time() - self.time
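
# Illustrative note (not part of the original source): FiniteHorizon implements
# the backward induction recursion
#     V_n(s) = max_a [ R(s, a) + discount * sum_{s'} P(s'|s, a) * V_{n+1}(s') ]
# starting from the terminal values V_N = h (zero by default). A minimal
# standalone sketch of the same recursion for dense arrays, assuming P has
# shape (A, S, S) and R has shape (S, A):
#
#     V = zeros((S, N + 1))
#     for n in reversed(range(N)):
#         Q = array([R[:, a] + discount * P[a].dot(V[:, n + 1]) for a in range(A)])
#         V[:, n] = Q.max(axis=0)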

class LP(MDP):
    """Resolution of discounted MDP with linear programming

    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or 
             a cell array (1xA), each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount rate, in ]0; 1[
    h(S)     = terminal reward, optional (default [0; 0; ... 0] )
    
    Evaluation
    ----------
    V(S)   = optimal values
    policy(S) = optimal policy
    cpu_time = used CPU time
    
    Notes
    -----
    In verbose mode, displays the current stage and policy transpose.
    
    Examples
    --------
    """

    def __init__(self, transitions, reward, discount):
        """"""
        
        try:
            from cvxopt import matrix, solvers
            self.linprog = solvers.lp
        except ImportError:
            raise ImportError("The python module cvxopt is required to use "
                              "linear programming functionality.")
        
        from scipy.sparse import eye as speye

        MDP.__init__(self, transitions, reward, discount, None)
        
        # The objective is to resolve : min V / V >= PR + discount*P*V
        # The linprog function of MATLAB's Optimization Toolbox solves :
        # min f'* x / M * x <= b
        # So the objective can be expressed as :
        # min V / (discount*P-I) * V <= - PR
        # To avoid looping over the states, the matrix M is structured
        # following the actions M(A*S, S)
        self.f = ones((self.S, 1))
        
        self.M = zeros((self.A * self.S, self.S))
        for aa in range(self.A):
            pos = (aa + 1) * self.S
            self.M[(pos - self.S):pos, :] = discount * self.P[aa] - speye(self.S, self.S)
        
        self.M = matrix(self.M)
    
    def iterate(self):
        """"""
        self.time = time()
        
        self.value = self.linprog(self.f, self.M, -self.R)
        
        self.value, self.policy = self.bellmanOperator(self.P, self.R, self.discount, self.value)
        
        self.time = time() - self.time
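
# Illustrative note (not part of the original source): the linear program built
# in LP.__init__ above is
#     minimise    sum_s V(s)
#     subject to  (discount * P_a - I) * V <= -PR_a   for every action a,
# which is equivalent to V >= PR_a + discount * P_a * V; the smallest feasible
# V is the optimal value function of the discounted MDP.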

class PolicyIteration(MDP):
    """Resolution of discounted MDP with policy iteration algorithm.
    
    Examples
    --------
    >>> import mdp
    >>> P, R = mdp.exampleRand(5, 3)
    >>> pi = mdp.PolicyIteration(P, R, 0.9)
    >>> pi.iterate()
    
    """
    
    def __init__(self, transitions, reward, discount, epsilon=0.01, max_iter=1000, initial_value=0):
        """"""
        MDP.__init__(self, transitions, reward, discount, max_iter)
        
        self.check(transitions, reward)
        
        self.S = transitions.shape[1]
        
        self.A = transitions.shape[0]
        
        self.P = transitions
        
        self.R = reward
        
        #self.computePR(transitions, reward)
        
        if (initial_value == 0):
            self.value = zeros((self.S))
            #self.value = matrix(zeros((self.S, 1)))
        else:
            if (len(initial_value) != self.S):
                raise ValueError("The initial value must be length S")
            
            self.value = matrix(initial_value)
        
        self.policy = randi(0, self.A, self.S)

        self.discount = discount
        self.max_iter = max_iter
        
        self.iter = 0
    
    def iterate(self):
        """"""
        done = False
        stop_criterion = 0.01
        
        while not done:
            stop = False
            while not stop:
                change = 0
                for s in range(self.S):
                    v = self.value[s]
                    a = self.policy[s]
                    self.value[s] = (self.P[a, s, :] * (self.R[a, s, :] +
                        (self.discount * self.value))).sum()
                    change = max(change, abs(v - self.value[s]))
                
                if change < stop_criterion:
                    stop = True
            
            policy_stable = True
            for s in range(self.S):
                b = self.policy[s]
                self.policy[s] = (self.P[:, s, :] * (self.R[:, s, :] +
                        (self.discount * self.value))).sum(1).argmax()
                if b !=  self.policy[s]:
                    policy_stable = False
            
            if policy_stable:
                done = True
        
        # store value and policy as tuples
        self.value = tuple(array(self.value).reshape(self.S).tolist())
        self.policy = tuple(array(self.policy).reshape(self.S).tolist())
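    
    # Illustrative note (not part of the original source): iterate() above
    # alternates two phases until the policy is stable: iterative policy
    # evaluation,
    #     V(s) <- sum_{s'} P(s'|s, policy(s)) * (R(s, policy(s), s') + discount * V(s')),
    # swept until the largest change falls below stop_criterion, and greedy
    # policy improvement,
    #     policy(s) <- argmax_a sum_{s'} P(s'|s, a) * (R(s, a, s') + discount * V(s')).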

class PolicyIterationModified(MDP):
    """Resolution of discounted MDP with modified policy iteration algorithm
    
    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or 
             a cell array (1xA), each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount rate, in ]0, 1[
    policy0(S) = starting policy, optional 
    max_iter = maximum number of iterations to be done, greater than 0, 
             optional (default 1000)
    eval_type = type of function used to evaluate policy: 
             0 for mdp_eval_policy_matrix, else mdp_eval_policy_iterative
             optional (default 0)
    
    Data Attributes
    ---------------
    V(S)   = value function 
    policy(S) = optimal policy
    iter     = number of done iterations
    cpu_time = used CPU time
    
    Notes
    -----
    In verbose mode, at each iteration, displays the number 
    of different actions between policy n-1 and n
    
    Examples
    --------
    >>> import mdp
    """
    
    def __init__(self, transitions, reward, discount, epsilon=0.01, max_iter=10):
        """"""
        
        MDP.__init__(self, transitions, reward, discount, max_iter)
        
        if epsilon <= 0:
            raise ValueError("epsilon must be greater than 0")
        
        # computation of threshold of variation for V for an epsilon-optimal policy
        if self.discount != 1:
            self.thresh = epsilon * (1 - self.discount) / self.discount
        else:
            self.thresh = epsilon
        
        if discount == 1:
            self.value = matrix(zeros((self.S, 1)))
        else:
            # 1/(1 - discount) times the smallest reward is a lower bound on V
            self.value = 1 / (1 - discount) * self.R.min() * ones((self.S, 1))
    
    def evalPolicyIterative(self):
        """"""
        pass
    
    def iterate(self):
        """"""
        
        if self.verbose:
            print('  Iteration  V_variation')
        
        self.time = time()
        
        done = False
        while not done:
            self.iter = self.iter + 1
            
            Vnext, policy = self.bellmanOperator(self.P, self.R, self.discount, self.value)
            #[Ppolicy, PRpolicy] = mdp_computePpolicyPRpolicy(P, PR, policy);
            
            variation = self.getSpan(Vnext - self.value)
            if self.verbose:
                print("      %s         %s" % (self.iter, variation))
            
            self.value = Vnext
            if variation < self.thresh:
                done = True
            else:
                is_verbose = False
                if self.verbose:
                    self.setSilent()
                    is_verbose = True
                
                self.value = self.evalPolicyIterative()
                
                if is_verbose:
                    self.setVerbose()
        
        self.time = time() - self.time
class QLearning(MDP):
Steven Cordwell's avatar
    """Evaluates the matrix Q, using the Q learning algorithm.
    
    Let S = number of states, A = number of actions
    
    Parameters
    ----------
    P : transition matrix (SxSxA)
        P could be an array with 3 dimensions or a cell array (1xA), each
        cell containing a sparse matrix (SxS)
    R : reward matrix(SxSxA) or (SxA)
        R could be an array with 3 dimensions (SxSxA) or a cell array
        (1xA), each cell containing a sparse matrix (SxS) or a 2D
        array(SxA) possibly sparse
    discount : discount rate
        in ]0; 1[    
    n_iter : number of iterations to execute (optional).
        Default value = 10000; it must be an integer of at least 10000.
    
    Results
    -------
    Q : learned Q matrix (SxA) 
    
    value : learned value function (S).
    
    policy : learned optimal policy (S).
    
    mean_discrepancy : vector of mean V discrepancy, one entry per 100
        iterations, so its length for the default value of n_iter is 100
        (n_iter/100).

    Examples
    --------
    >>> import mdp
    >>> P, R = mdp.exampleForest()
    >>> ql = mdp.QLearning(P, R, 0.96)
    >>> ql.iterate()
    >>> ql.Q
    array([[  0.        ,   0.        ],
           [  0.01062959,   0.79870231],
           [ 10.08191776,   0.35309404]])
    >>> ql.value
    array([  0.        ,   0.79870231,  10.08191776])
    >>> ql.policy
    array([0, 1, 0])
    
    >>> import mdp
    >>> import numpy as np
    >>> P = np.array([[[0.5, 0.5],[0.8, 0.2]],[[0, 1],[0.1, 0.9]]])
    >>> R = np.array([[5, 10], [-1, 2]])
    >>> ql = mdp.QLearning(P, R, 0.9)
    >>> ql.iterate()
    >>> ql.Q
    array([[ 94.99525115,  99.99999007],
           [ 53.92930199,   5.57331205]])
    >>> ql.value
    array([ 99.99999007,  53.92930199])
    >>> ql.policy
    array([1, 0])
    >>> ql.time
    0.6501460075378418
    
    """
    
    def __init__(self, transitions, reward, discount, n_iter=10000):
        """Evaluation of the matrix Q, using the Q learning algorithm
        """
        
        # The following check won't be done in MDP()'s initialisation, so let's
        # do it here
        if (n_iter < 10000):
            raise ValueError("PyMDPtoolbox: n_iter should be at least 10000")
        
        # after this n_iter will be known as self.max_iter
        MDP.__init__(self, transitions, reward, discount, n_iter)
        
        # Initialisations
        self.Q = zeros((self.S, self.A))
        #self.dQ = zeros(self.S, self.A)
        self.mean_discrepancy = []
        self.discrepancy = []
        
    def iterate(self):
        """
        """
        self.time = time()
        
        # initial state choice
        # s = randint(0, self.S - 1)
        
        for n in range(self.max_iter):
            
            # Reinitialisation of trajectories every 100 transitions
            if ((n % 100) == 0):
                s = randint(0, self.S - 1)
            
            # Action choice : greedy with increasing probability
            # probability 1-(1/log(n+2)) can be changed
            pn = random()
            if (pn < (1 - (1 / log(n + 2)))):
                # optimal_action = self.Q[s, :].max()
                a = self.Q[s, :].argmax()
            else:
                a = randint(0, self.A - 1)
            
            # Simulating next state s_new and reward associated to <s,s_new,a>
            p_s_new = random()
            p = 0
            s_new = -1
            while ((p < p_s_new) and (s_new < (self.S - 1))):
                s_new = s_new + 1
                p = p + self.P[a][s, s_new]
            
            if (self.R.dtype == object):
                r = self.R[a][s, s_new]
            elif (self.R.ndim == 3):
                r = self.R[a, s, s_new]
            else:
                r = self.R[s, a]
            
            # Updating the value of Q
            # Decaying update coefficient (1/sqrt(n+2)) can be changed
            delta = r + self.discount * self.Q[s_new, :].max() - self.Q[s, a]
            dQ = (1 / sqrt(n + 2)) * delta
            self.Q[s, a] = self.Q[s, a] + dQ
            
            # current state is updated
            s = s_new
            
            # Computing and saving maximal values of the Q variation
            self.discrepancy.append(absolute(dQ))
            
            # Computing means all over maximal Q variations values
            if ((n % 100) == 99):
                self.mean_discrepancy.append(mean(self.discrepancy))
                self.discrepancy = []
            
            # compute the value function and the policy
            self.value = self.Q.max(axis=1)
            self.policy = self.Q.argmax(axis=1)
            
        self.time = time() - self.time
        
        # rather than report that we have not done any iterations, assign the
        # value of n_iter to self.iter
        self.iter = self.max_iter
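
# Illustrative note (not part of the original source): the update applied in
# QLearning.iterate() is the standard tabular Q-learning rule
#     Q(s, a) <- Q(s, a) + alpha_n * (r + discount * max_a' Q(s_new, a') - Q(s, a))
# with decaying learning rate alpha_n = 1 / sqrt(n + 2), and exploration that
# becomes greedy with probability 1 - 1 / log(n + 2) as n grows.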

class RelativeValueIteration(MDP):
    """Resolution of MDP with average reward with relative value iteration 
    algorithm 
    
    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or 
             a cell array (1xA), each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    epsilon  = epsilon-optimal policy search, greater than 0, 
             optional (default: 0.01)
    max_iter = maximum number of iterations to be done, greater than 0,
             optional (default 1000)
    
    Evaluation
    ----------
    policy(S)       = epsilon-optimal policy
    average_reward  = average reward of the optimal policy
    cpu_time = used CPU time
    
    Notes
    -----
    In verbose mode, at each iteration, displays the span of U variation
    and the condition which stopped iterations : epsilon-optimum policy found
    or maximum number of iterations reached.
    
    Examples
    --------
    """
    
    def __init__(self, transitions, reward, epsilon=0.01, max_iter=1000):
        
        MDP.__init__(self, transitions, reward, None, max_iter)
        
        if epsilon <= 0:
            raise ValueError('MDP Toolbox ERROR: epsilon must be greater than 0')
        self.epsilon = epsilon
        
        self.U = zeros(self.S)
        self.gain = self.U[self.S - 1]
    
    def iterate(self):
        """"""
        
        done = False
        if self.verbose:
            print('  Iteration  U_variation')
        
        self.time = time()
        
        while not done:
            
            self.iter = self.iter + 1
            
            Unext, policy = self.bellmanOperator(self.P, self.R, 1, self.U)
            Unext = Unext - self.gain
            
            variation = self.getSpan(Unext - self.U)
            
            if self.verbose:
                print("      %s         %s" % (self.iter, variation))

            if variation < self.epsilon:
                done = True
                self.average_reward = self.gain + (Unext - self.U).min()
                if self.verbose:
                    print('MDP Toolbox : iterations stopped, epsilon-optimal policy found')
            elif self.iter == self.max_iter:
                done = True
                self.average_reward = self.gain + (Unext - self.U).min()
                if self.verbose:
                    print('MDP Toolbox : iterations stopped by maximum number of iteration condition')
            
            self.U = Unext
            self.gain = self.U[self.S - 1]
        
        self.time = time() - self.time
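
# Illustrative note (not part of the original source): RelativeValueIteration
# iterates on relative values rather than discounted values,
#     U_{k+1} = max_a [ R_a + P_a * U_k ] - g_k,    g_k = U_k(s_ref),
# with the last state as the reference s_ref; on convergence g_k approaches
# the optimal average reward and the greedy policy w.r.t. U is epsilon-optimal.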

class ValueIteration(MDP):
    """
    Solves discounted MDP with the value iteration algorithm.
    
    Description
    -----------
    mdp.ValueIteration applies the value iteration algorithm to solve
    discounted MDP. The algorithm consists in solving Bellman's equation
    iteratively.
    Iterating is stopped when an epsilon-optimal policy is found or after a
    specified number (max_iter) of iterations. 
    This function uses verbose and silent modes. In verbose mode, the function
    displays the variation of V (value function) for each iteration and the
    condition which stopped iterations: epsilon-policy found or maximum number
    of iterations reached.
    
    Let S = number of states, A = number of actions.
    
    Parameters
    ----------
    P : transition matrix 
        P could be a numpy ndarray with 3 dimensions (AxSxS) or a 
        numpy ndarray of dtype=object with 1 dimension (1xA), each
        element containing a numpy ndarray (SxS) or scipy sparse matrix. 
    R : reward matrix
        R could be a numpy ndarray with 3 dimensions (AxSxS) or numpy
        ndarray of dtype=object with 1 dimension (1xA), each element
        containing a sparse matrix (SxS). R also could be a numpy 
        ndarray with 2 dimensions (SxA) possibly sparse.
    discount : discount rate
        Greater than 0, less than or equal to 1. Beware to check conditions of
        convergence for discount = 1.
    epsilon : epsilon-optimal policy search
        Greater than 0, optional (default: 0.01).
    max_iter : maximum number of iterations to be done
        Greater than 0, optional (default: computed)
    initial_value : starting value function
        optional (default: zeros(S,1)).
    
    Data Attributes
    ---------------
    value : value function
        A vector which stores the optimal value function. Prior to calling the
        iterate() method it has a value of None. Shape is (S, ).
    policy : epsilon-optimal policy
        A vector which stores the optimal policy. Prior to calling the
        iterate() method it has a value of None. Shape is (S, ).
    iter : number of iterations taken to complete the computation
        An integer
    time : used CPU time
        A float
    
    Methods
    -------
    iterate()
        Starts the loop for the algorithm to be completed.
    setSilent()
        Sets the instance to silent mode.
    setVerbose()
        Sets the instance to verbose mode.
    
    Notes
    -----
    In verbose mode, at each iteration, displays the variation of V
    and the condition which stopped iterations: epsilon-optimum policy found
    or maximum number of iterations reached.
    
    Examples
    --------
    >>> import mdp
    >>> P, R = mdp.exampleForest()
    >>> vi = mdp.ValueIteration(P, R, 0.96)
    >>> vi.verbose
    False
    >>> vi.iterate()
    >>> vi.value
    array([  5.93215488,   9.38815488,  13.38815488])
    >>> vi.policy
    array([0, 0, 0])
    >>> vi.iter
    4
    >>> vi.time
    0.002871990203857422
    
    >>> import mdp
    >>> import numpy as np
    >>> P = np.array([[[0.5, 0.5],[0.8, 0.2]],[[0, 1],[0.1, 0.9]]])
    >>> R = np.array([[5, 10], [-1, 2]])
    >>> vi = mdp.ValueIteration(P, R, 0.9)
    >>> vi.iterate()
    >>> vi.value
    array([ 40.04862539,  33.65371176])
    >>> vi.policy
    array([1, 0])
    >>> vi.iter
    26
    >>> vi.time
    0.010202884674072266
    
    >>> import mdp
    >>> import numpy as np
    >>> from scipy.sparse import csr_matrix as sparse
    >>> P = np.zeros((2, ), dtype=object)
    >>> P[0] = sparse([[0.5, 0.5],[0.8, 0.2]])
    >>> P[1] = sparse([[0, 1],[0.1, 0.9]])
    >>> R = np.array([[5, 10], [-1, 2]])
    >>> vi = mdp.ValueIteration(P, R, 0.9)
    >>> vi.iterate()
    >>> vi.value
    array([ 40.04862539,  33.65371176])
    >>> vi.policy
    array([1, 0])
    
    """
    
    def __init__(self, transitions, reward, discount, epsilon=0.01, max_iter=1000, initial_value=0):
        """Resolution of discounted MDP with value iteration algorithm."""