# -*- coding: utf-8 -*-
"""
Copyright (c) 2011, 2012, 2013 Steven Cordwell
Copyright (c) 2009, Iadine Chadès
Copyright (c) 2009, Marie-Josée Cros
Copyright (c) 2009, Frédérick Garcia
Copyright (c) 2009, Régis Sabbadin

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

  * Redistributions of source code must retain the above copyright notice, this
    list of conditions and the following disclaimer.
  * Redistributions in binary form must reproduce the above copyright notice,
    this list of conditions and the following disclaimer in the documentation
    and/or other materials provided with the distribution.
  * Neither the name of the <ORGANIZATION> nor the names of its contributors
    may be used to endorse or promote products derived from this software
    without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""

from numpy import absolute, array, diag, matrix, mean, mod, multiply, ndarray
from numpy import nonzero, ones, zeros
from numpy.random import rand
from math import ceil, log, sqrt
from random import randint, random
from scipy.sparse import csr_matrix as sparse
from time import time

mdperr = {
"mat_nonneg" :
    "PyMDPtoolbox: Probabilities must be non-negative.",
"mat_square" :
    "PyMDPtoolbox: The matrix must be square.",
"mat_stoch" :
    "PyMDPtoolbox: Rows of the matrix must sum to one (1).",
"mask_numpy" :
    "PyMDPtoolbox: mask must be a numpy array or matrix; i.e. type(mask) is "
    "ndarray or type(mask) is matrix.", 
"mask_SbyS" : 
    "PyMDPtoolbox: The mask must have shape SxS; i.e. mask.shape = (S, S).",
"obj_shape" :
    "PyMDPtoolbox: Object arrays for transition probabilities and rewards "
    "must have only 1 dimension: the number of actions A. Each element of "
    "the object array contains an SxS ndarray or matrix.",
"obj_square" :
    "PyMDPtoolbox: Each element of an object array for transition "
    "probabilities and rewards must contain an SxS ndarray or matrix; i.e. "
    "P[a].shape = (S, S) or R[a].shape = (S, S).",
"P_type" :
    "PyMDPtoolbox: The transition probabilities must be in a numpy array; "
    "i.e. type(P) is ndarray.",
"P_shape" :
    "PyMDPtoolbox: The transition probability array must have the shape "
    "(A, S, S)  with S : number of states greater than 0 and A : number of "
    "actions greater than 0. i.e. R.shape = (A, S, S)",
"PR_incompat" :
    "PyMDPtoolbox: Incompatibility between P and R dimensions.",
"prob_in01" :
    "PyMDPtoolbox: Probability p must be in [0; 1].",
"R_type" :
    "PyMDPtoolbox: The rewards must be in a numpy array; i.e. type(R) is "
    "ndarray, or numpy matrix; i.e. type(R) is matrix.",
"R_shape" :
    "PyMDPtoolbox: The reward matrix R must be an array of shape (A, S, S) or "
    "(S, A) with S : number of states greater than 0 and A : number of actions "
    "greater than 0. i.e. R.shape = (S, A) or (A, S, S).",
"R_gt_0" :
    "PyMDPtoolbox: The rewards must be greater than 0.",
"S_gt_1" :
    "PyMDPtoolbox: Number of states S must be greater than 1.",
"SA_gt_1" : 
    "PyMDPtoolbox: The number of states S and the number of actions A must be "
    "greater than 1.",
"discount_rng" : 
    "PyMDPtoolbox: Discount rate must be in ]0; 1]",
"maxi_min" :
    "PyMDPtoolbox: The maximum number of iterations must be greater than 0"
}

def check(P, R):
    """Checks if the matrices P and R define a Markov Decision Process.
    
    Let S = number of states, A = number of actions.
    The transition matrix P must be on the shape (A, S, S) and P[a,:,:]
    must be stochastic.
    The reward matrix R must be on the shape (A, S, S) or (S, A).
    Raises an error if P and R do not define a MDP.
    
    Parameters
    ---------
    P : transition matrix (A, S, S)
        P could be an array with 3 dimensions or a object array (A, ),
        each cell containing a matrix (S, S) possibly sparse
    R : reward matrix (A, S, S) or (S, A)
        R could be an array with 3 dimensions (SxSxA) or a object array
        (A, ), each cell containing a sparse matrix (S, S) or a 2D
        array(S, A) possibly sparse  
    """
    
    # Check of P
    # transitions must be a numpy array: either an AxSxS ndarray (with any
    # dtype other than "object"); or, a 1xA ndarray with an "object" dtype,
    # each element containing an SxS array. An AxSxS array will be
    # converted to an object array. A numpy object array is similar to a
    # MATLAB cell array.
    if (not type(P) is ndarray):
        raise TypeError(mdperr["P_type"])
    
    if (not type(R) is ndarray):
        raise TypeError(mdperr["R_type"])
    
    # NumPy has an array type of 'object', which is roughly equivalent to
    # the MATLAB cell array. These are most useful for storing sparse
    # matrices as these can only have two dimensions whereas we want to be
    # able to store a transition matrix for each action. If the dtype of
    # the transition probability array is object then we store this as
    # P_is_object = True.
    # If it is an object array, then it should only have one dimension,
    # otherwise fail with a message explaining why.
    # If it is a normal array then the number of dimensions must be exactly
    # three, otherwise fail with a message explaining why.
    if (P.dtype == object):
        if (P.ndim > 1):
            raise ValueError(mdperr["obj_shape"])
        else:
            P_is_object = True
    else:
        if (P.ndim != 3):
            raise ValueError(mdperr["P_shape"])
        else:
            P_is_object = False
    
    # As above but for the reward array. A difference is that the reward
    # array can have either two or 3 dimensions.
    if (R.dtype == object):
        if (R.ndim > 1):
            raise ValueError(mdperr["obj_shape"])
        else:
            R_is_object = True
    else:
        if (not R.ndim in (2, 3)):
            raise ValueError(mdperr["R_shape"])
        else:
            R_is_object = False
    
    # We want to make sure that the transition probability array and the 
    # reward array are in agreement. This means that both should show that
    # there are the same number of actions and the same number of states.
    # Furthermore the probability of transition matrices must be SxS in
    # shape, so we check for that also.
    if P_is_object:
        # If the user has put their transition matrices into a numpy array
        # with dtype of 'object', then it is possible that they have made a
        # mistake and not all of the matrices are of the same shape. So,
        # here we record the number of actions and states that the first
        # matrix in element zero of the object array says it has. After
        # that we check that every other matrix also reports the same
        # number of actions and states, otherwise fail with an error.
        # aP: the number of actions in the transition array. This
        # corresponds to the number of elements in the object array.
        aP = P.shape[0]
        # sP0: the number of states as reported by the number of rows of
        # the transition matrix
        # sP1: the number of states as reported by the number of columns of
        # the transition matrix
        sP0, sP1 = P[0].shape
        # Now we check to see that every element of the object array holds
        # a matrix of the same shape, otherwise fail.
        for aa in range(1, aP):
            # sP0aa and sP1aa represent the number of states in each
            # subsequent element of the object array. If it doesn't match
            # what was found in the first element, then we need to fail
            # telling the user what needs to be fixed.
            sP0aa, sP1aa = P[aa].shape
            if ((sP0aa != sP0) or (sP1aa != sP1)):
                raise ValueError(mdperr["obj_square"])
    else:
        # if we are using a normal array for this, then the first
        # dimension should be the number of actions, and the second and 
        # third should be the number of states
        aP, sP0, sP1 = P.shape
    
    # the first dimension of the transition matrix must report the same
    # number of states as the second dimension. If not then we are not
    # dealing with a square matrix and it is not a valid transition
    # probability. Also, if the number of actions is less than one, or the
    # number of states is less than one, then it also is not a valid
    # transition probability.
    if ((sP0 < 1) or (aP < 1) or (sP0 != sP1)):
        raise ValueError(mdperr["P_shape"])
    
    # now we check that each transition matrix is square-stochastic. For
    # object arrays this is the matrix held in each element, but for
    # normal arrays this is a matrix formed by taking a slice of the array
    for aa in range(aP):
        if P_is_object:
            checkSquareStochastic(P[aa])
        else:
            checkSquareStochastic(P[aa, :, :])
        # aa = aa + 1 # why was this here?
    
    if R_is_object:
        # if the reward array has an object dtype, then we check that
        # each element contains a matrix of the same shape as we did 
        # above with the transition array.
        aR = R.shape[0]
        sR0, sR1 = R[0].shape
        for aa in range(1, aR):
            sR0aa, sR1aa = R[aa].shape
            if ((sR0aa != sR0) or (sR1aa != sR1)):
                raise ValueError(mdperr["obj_square"])
    elif (R.ndim == 3):
        # This indicates that the reward matrices are constructed per 
        # transition, so that the first dimension is the actions and
        # the second two dimensions are the states.
        aR, sR0, sR1 = R.shape
    else:
        # then the reward matrix is per state, so the first dimension is 
        # the states and the second dimension is the actions.
        sR0, aR = R.shape
        # this is added just so that the next check doesn't error out
        # saying that sR1 doesn't exist
        sR1 = sR0
    
    # the number of actions must be more than zero, the number of states
    # must also be more than 0, and the states must agree
    if ((sR0 < 1) or (aR < 1) or (sR0 != sR1)):
        raise ValueError(mdperr["R_shape"])
    
    # now we check to see that what the transition array is reporting and
    # what the reward array is reporting agree as to the number of actions
    # and states. If not then fail explaining the situation
    if (sP0 != sR0) or (aP != aR):
        raise ValueError(mdperr["PR_incompat"])
        
    # We are at the end of the checks, so if no exceptions have been raised
    # then that means there are (hopefully) no errors and we return None
    return None
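
# Illustrative usage sketch (added comment, not part of the original module):
# a two-state, two-action problem with uniform transitions passes check()
# silently, since each P[a] is square and row-stochastic and R has the shape
# (S, A).
#
#   >>> from numpy import array, ones
#   >>> P = 0.5 * ones((2, 2, 2))
#   >>> R = array([[1., 2.], [0., 1.]])
#   >>> check(P, R)          # returns None; invalid inputs raise an error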

def checkSquareStochastic(Z):
    """Check if Z is a square stochastic matrix
    
    Parameters
    ----------
    Z : an SxS matrix. It could be a numpy ndarray SxS, or a scipy.sparse
        csr_matrix
    
    Evaluation
    ----------
    Returns None if no error has been detected
    """
    s1, s2 = Z.shape
    if (s1 != s2):
        raise ValueError(mdperr["mat_square"])
    elif (absolute(Z.sum(axis=1) - ones(s2))).max() > 10**(-12):
        raise ValueError(mdperr["mat_stoch"])
    elif ((type(Z) is ndarray) or (type(Z) is matrix)) and (Z < 0).any():
        raise ValueError(mdperr["mat_nonneg"])
    elif (type(Z) is sparse) and (Z.data < 0).any():
        raise ValueError(mdperr["mat_nonneg"]) 
    else:
        return(None)
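
# Illustrative only (added comment): the identity matrix is square and
# stochastic, so the call below returns None, whereas a matrix with a row
# that does not sum to one raises ValueError(mdperr["mat_stoch"]).
#
#   >>> from numpy import eye
#   >>> checkSquareStochastic(eye(3))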

def exampleForest(S=3, r1=4, r2=2, p=0.1):
    """
    Generates a Markov Decision Process example based on a simple forest
    management.
    
    See the related documentation for more detail.
    
    Parameters
    ---------
    S : number of states (> 0), optional (default 3)
    r1 : reward when forest is in the oldest state and action Wait is performed,
        optional (default 4)
    r2 : reward when forest is in the oldest state and action Cut is performed, 
        optional (default 2)
    p : probability of wild fire occurence, in ]0, 1[, optional (default 0.1)
    
    Evaluation
    ----------
    P : transition probability matrix (A, S, S)
    R : reward matrix (S, A)
    
    Examples
    --------
    >>> import mdp
    >>> P, R = mdp.exampleForest()
    >>> P
    array([[[ 0.1,  0.9,  0. ],
            [ 0.1,  0. ,  0.9],
            [ 0.1,  0. ,  0.9]],

           [[ 1. ,  0. ,  0. ],
            [ 1. ,  0. ,  0. ],
            [ 1. ,  0. ,  0. ]]])
    >>> R
    array([[ 0.,  0.],
           [ 0.,  1.],
           [ 4.,  2.]])
    
    """
    if (S <= 1):
        raise ValueError(mdperr["S_gt_1"])
    if (r1 <= 0) or (r2 <= 0):
        raise ValueError(mdperr["R_gt_0"])
    if (p < 0 or p > 1):
        raise ValueError(mdperr["prob_in01"])
    
    # Definition of Transition matrix P(:,:,1) associated to action Wait (action 1) and
    # P(:,:,2) associated to action Cut (action 2)
    #             | p 1-p 0.......0  |                  | 1 0..........0 |
    #             | .  0 1-p 0....0  |                  | . .          . |
    #  P(:,:,1) = | .  .  0  .       |  and P(:,:,2) =  | . .          . |
    #             | .  .        .    |                  | . .          . |
    #             | .  .         1-p |                  | . .          . |
    #             | p  0  0....0 1-p |                  | 1 0..........0 |
    P = zeros((2, S, S))
    P[0, :, :] = (1 - p) * diag(ones(S - 1), 1)
    P[0, :, 0] = p
    P[0, S - 1, S - 1] = (1 - p)
    P[1, :, :] = zeros((S, S))
    P[1, :, 0] = 1
    
    # Definition of Reward matrix R1 associated to action Wait and 
    # R2 associated to action Cut
    #           | 0  |                   | 0  |
    #           | .  |                   | 1  |
    #  R(:,1) = | .  |  and     R(:,2) = | .  |	
    #           | .  |                   | .  |
    #           | 0  |                   | 1  |                   
    #           | r1 |                   | r2 |
    R = zeros((S, 2))
    R[S - 1, 0] = r1
    R[:, 1] = ones(S)
    R[0, 1] = 0
    R[S - 1, 1] = r2
    
    return (P, R)
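
# A hedged follow-on sketch (added comment): the (P, R) pair built above can
# be fed straight into the solver classes defined later in this module, as in
# the PolicyIteration docstring example.
#
#   >>> P, R = exampleForest()
#   >>> pi = PolicyIteration(P, R, 0.9)
#   >>> pi.iterate()
#   >>> pi.policy            # a tuple of length S once iterate() has run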

def exampleRand(S, A, is_sparse=False, mask=None):
    """Generates a random Markov Decision Process.
    
    Parameters
    ----------
    S : number of states (> 0)
    A : number of actions (> 0)
    is_sparse : false to have matrices in plain format, true to have sparse
        matrices, optional (default false)
    mask : matrix with 0 and 1 (0 indicates a place for a zero
           probability), optional (SxS) (default, random)
    
    Returns
    -------
    P : transition probability matrix (A, S, S)
    R : reward matrix (A, S, S)

    Examples
    --------
    >>> import mdp
    >>> P, R = mdp.exampleRand(5, 3)

    """
    if (S < 1 or A < 1):
        raise ValueError(mdperr["SA_gt_1"])
    
    try:
        if (mask is not None) and ((mask.shape[0] != S) or (mask.shape[1] != S)):
            raise ValueError(mdperr["mask_SbyS"])
    except AttributeError:
        raise TypeError(mdperr["mask_numpy"])
    
    if mask is None:
        mask = rand(A, S, S)
        for a in range(A):
            r = random()
            mask[a][mask[a] < r] = 0
            mask[a][mask[a] >= r] = 1
    
    if is_sparse:
        # definition of transition matrix : square stochastic matrix
        P = zeros((A, ), dtype=object)
        # definition of reward matrix (values between -1 and +1)
        R = zeros((A, ), dtype=object)
        for a in range(A):
            PP = mask[a] * rand(S, S)
            for s in range(S):
                if (mask[a, s, :].sum() == 0):
                    PP[s, randint(0, S - 1)] = 1
                PP[s, :] = PP[s, :] / PP[s, :].sum()
            P[a] = sparse(PP)
            R[a] = sparse(mask[a] * (2 * rand(S, S) - ones((S, S))))
    else:
        # definition of transition matrix : square stochastic matrix
        P = zeros((A, S, S))
        # definition of reward matrix (values between -1 and +1)
        R = zeros((A, S, S))
        for a in range(A):
            P[a, :, :] = mask[a] * rand(S, S)
            for s in range(S):
                if (mask[a, s, :].sum() == 0):
                    P[a, s, randint(0, S - 1)] = 1
                P[a, s, :] = P[a, s, :] / P[a, s, :].sum()
            R[a, :, :] = mask[a] * (2 * rand(S, S) - ones((S, S), dtype=int))
    
    return (P, R)
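
# Illustrative only (added comment): with is_sparse=True each P[a] and R[a]
# is a scipy.sparse csr_matrix stored inside a 1-dimensional object array of
# length A.
#
#   >>> P, R = exampleRand(10, 3, is_sparse=True)
#   >>> P.shape, P[0].shape
#   ((3,), (10, 10))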

def getSpan(W):
    """Returns the span of W
    
    sp(W) = max W(s) - min W(s)
    """
    return (W.max() - W.min())
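
# Illustrative only (added comment), assuming the single-argument signature
# above: the span is the gap between the largest and smallest components of
# W, and is used further down as a stopping criterion.
#
#   >>> from numpy import array
#   >>> getSpan(array([1.0, 4.0, 2.5]))
#   3.0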

class MDP(object):
    """The Markov Decision Problem Toolbox."""
    
    def __init__(self, transitions, reward, discount, max_iter):
        """"""
        
        # if the discount is None then the algorithm is assumed to not use it
        # in its computations
        if (type(discount) is int) or (type(discount) is float):
            if (discount <= 0) or (discount > 1):
                raise ValueError(mdperr["discount_rng"])
            else:
                self.discount = discount
        elif discount is not None:
            raise ValueError("PyMDPtoolbox: the discount must be a positive " \
                "real number less than or equal to one.")
        
        # if the max_iter is None then the algorithm is assumed to not use it
        # in its computations
        if (type(max_iter) is int) or (type(max_iter) is float):
            if (max_iter <= 0):
                raise ValueError(mdperr["maxi_min"])
            else:
                self.max_iter = max_iter
        elif max_iter is not None:
            raise ValueError("PyMDPtoolbox: max_iter must be a positive real "\
                "number greater than zero.")
        
        # we run a check on P and R to make sure they are describing an MDP. If
        # an exception isn't raised then they are assumed to be correct.
        check(transitions, reward)
        
        # computePR will assign the variables self.S, self.A, self.P and self.R
        self.computePR(transitions, reward)
        
        # the verbosity is by default turned off
        self.verbose = False
        # Initially the time taken to perform the computations is set to None
        self.time = None
        # set the initial iteration count to zero
        self.iter = 0
        
        self.value = None
        self.policy = None
    
    def bellmanOperator(self):
        """
        Applies the Bellman operator on the value function.
        
        Updates the value function and the Vprev-improving policy.
        
        Returns
        -------
        (policy, value) : tuple of new policy and its value
        """
        Q = matrix(zeros((self.S, self.A)))
        for aa in range(self.A):
            Q[:, aa] = self.R[:, aa] + (self.discount * self.P[aa] * self.value)
        
        # Which way is better? If the first way is chosen, then the classes
        # that call this function must be changed.
        # 1. Return (policy, value)
        return (Q.argmax(axis=1), Q.max(axis=1))
        # 2. update self.policy and self.value directly
        # self.value = Q.max(axis=1)
        # self.policy = Q.argmax(axis=1)
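
    # Added note (not in the original source): in symbols, bellmanOperator
    # computes
    #
    #     Q(s, a) = R(s, a) + discount * sum_{s'} P(s'|s, a) * V(s')
    #
    # and returns policy(s) = argmax_a Q(s, a) along with
    # V(s) = max_a Q(s, a), which is what the matrix expression
    # R[:, aa] + discount * P[aa] * value evaluates one action at a time.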
    
    def computePpolicyPRpolicy(self):
        """Computes the transition matrix and the reward matrix for a policy

        Arguments
        ---------
        Let S = number of states, A = number of actions
        P(SxSxA)  = transition matrix 
             P could be an array with 3 dimensions or 
             a cell array (1xA), each cell containing a matrix (SxS) possibly sparse
        R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
        policy(S) = a policy
        
        Evaluation
        ----------
        Ppolicy(SxS)  = transition matrix for policy
        Rpolicy(S)    = reward vector for policy
        """
        Ppolicy = matrix(zeros((self.S, self.S)))
        Rpolicy = matrix(zeros((self.S, 1)))
        for aa in range(self.A): # avoid looping over S
        
            # the rows that use action a. .getA1() is used to make sure that
            # ind is a 1 dimensional vector
            ind = nonzero(self.policy == aa)[0].getA1()
            if ind.size > 0: # if no rows use action a, then no point continuing
                Ppolicy[ind, :] = self.P[aa][ind, :]
                
                #PR = self.computePR() # an apparently unneeded line, and
                # perhaps harmful in this implementation c.f.
                # mdp_computePpolicyPRpolicy.m
                Rpolicy[ind] = self.R[ind, aa]
        
        # self.R cannot be sparse with the code in its current condition, but
        # it should be possible in the future. Also, if R is so big that it's
        # a good idea to use a sparse matrix for it, then converting Rpolicy
        # from a dense to a sparse matrix doesn't seem very memory efficient
        if type(self.R) is sparse:
            Rpolicy = sparse(Rpolicy)
        
        #self.Ppolicy = Ppolicy
        #self.Rpolicy = Rpolicy
        return (Ppolicy, Rpolicy)
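
    # Added note (not in the original source): for a deterministic policy the
    # loop above is a vectorised gather, equivalent to
    #
    #     Ppolicy[s, :] = P[policy[s]][s, :]
    #     Rpolicy[s]    = R[s, policy[s]]
    #
    # for every state s, without looping over the S states.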
    
    def computePR(self, P, R):
        """Computes the reward for the system in one state choosing an action
        
        Arguments
        ---------
        Let S = number of states, A = number of actions
            P(SxSxA)  = transition matrix 
                P could be an array with 3 dimensions or a cell array (1xA), 
                each cell containing a matrix (SxS) possibly sparse
            R(SxSxA) or (SxA) = reward matrix
                R could be an array with 3 dimensions (SxSxA) or a cell array 
                (1xA), each cell containing a sparse matrix (SxS) or a 2D 
                array(SxA) possibly sparse  
        Evaluation
        ----------
            PR(SxA)   = reward matrix
        """
        # we assume that P and R define a MDP, i.e. the assumption is that
        # check(P, R) has already been run and doesn't fail.
        
        # make P be an object array with (S, S) shaped array elements
        if (P.dtype == object):
            self.P = P
            self.A = self.P.shape[0]
            self.S = self.P[0].shape[0]
        else: # convert to an object array
            self.A = P.shape[0]
            self.S = P.shape[1]
            self.P = zeros(self.A, dtype=object)
            for aa in range(self.A):
                self.P[aa] = P[aa, :, :]
        
        # make R have the shape (S, A)
        if R.dtype == object:
            # R is object shaped (A,) with each element shaped (S, S)
            self.R = zeros((self.S, self.A))
            for aa in range(self.A):
                self.R[:, aa] = multiply(P[aa], R[aa]).sum(1)
        else:
            if R.ndim == 2:
                # R already has shape (S, A)
                self.R = R
            else:
                # R has shape (A, S, S)
                self.R = zeros((self.S, self.A))
                for aa in range(self.A):
                    self.R[:, aa] = multiply(P[aa], R[aa, :, :]).sum(1)
        
        # convert the arrays to numpy matrices
        for aa in range(self.A):
            if (type(self.P[aa]) is ndarray):
                self.P[aa] = matrix(self.P[aa])
        if (type(self.R) is ndarray):
            self.R = matrix(self.R)
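    
    # Added note (not in the original source): when R is supplied per
    # transition, i.e. with shape (A, S, S) or as an object array of (S, S)
    # matrices, the (S, A) reward used by the solvers is the expected
    # immediate reward
    #
    #     R'(s, a) = sum_{s'} P(s'|s, a) * R(s, a, s')
    #
    # which is what the multiply(P[aa], R[aa]).sum(1) lines above compute.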

    def iterate(self):
        """This is a placeholder method. Child classes should define their own
        iterate() method.
        """
        raise NotImplementedError("You should create an iterate() method.")
    
    def setSilent(self):
        """Set the MDP resolution functions to run in silent mode."""
        self.verbose = False
    
    def setVerbose(self):
        """Set the MDP resolution functions to run in verbose mode."""
        self.verbose = True

class FiniteHorizon(MDP):
    """Reolution of finite-horizon MDP with backwards induction
    
    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or 
             a cell array (1xA), each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount factor, in ]0, 1]
    N        = number of periods, greater than 0
    h(S)     = terminal reward, optional (default [0; 0; ... 0] )
    Evaluation
    ----------
    V(S,N+1)     = optimal value function
                 V(:,n) = optimal value function at stage n
                        with stage in 1, ..., N
                        V(:,N+1) = value function for terminal stage 
    policy(S,N)  = optimal policy
                 policy(:,n) = optimal policy at stage n
                        with stage in 1, ...,N
                        policy(:,N) = policy for stage N
    cpu_time = used CPU time
  
    Notes
    -----
    In verbose mode, displays the current stage and policy transpose.

    """

    def __init__(self, transitions, reward, discount, N, h=None):
        """"""
        if N < 1:
            raise ValueError('PyMDPtoolbox: N must be greater than 0')
        else:
            self.N = N
        
        MDP.__init__(self, transitions, reward, discount, None)
        
        # the value function is stored column-wise: one column per stage,
        # plus a final column for the terminal stage
        self.value = zeros((self.S, N + 1))
        
        if h is not None:
            # the terminal reward fills the last (stage N) column
            self.value[:, N] = h
        
    def iterate(self):
        """"""
        self.time = time()
        
        for n in range(self.N - 1):
            W, X = self.bellmanOperator(self.P, self.R, self.discount, self.value[:, self.N - n + 1])
            self.value[:, self.N - n] = W
            self.policy[:, self.N - n] = X
            if self.verbose:
                print("stage: %s ... policy transpose : %s") % (self.N - n, self.policy[:, self.N - n].T)
        
        self.time = time() - self.time

class LP(MDP):
    """Resolution of discounted MDP with linear programming

    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or 
             a cell array (1xA), each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount rate, in ]0; 1[
    h(S)     = terminal reward, optional (default [0; 0; ... 0] )
    
    Evaluation
    ----------
    V(S)   = optimal values
    policy(S) = optimal policy
    cpu_time = used CPU time
    
    Notes    
    -----
    In verbose mode, displays the current stage and policy transpose.
    
    Examples
    --------
    """

    def __init__(self, transitions, reward, discount):
        """"""
        
        try:
            from cvxopt import matrix, solvers
            self.linprog = solvers.lp
        except ImportError:
            raise ImportError("The python module cvxopt is required to use " \
                "linear programming functionality.")
        
        from scipy.sparse import eye as speye

        MDP.__init__(self, transitions, reward, discount, None)
        
        # The objective is to resolve : min V / V >= PR + discount*P*V
        # The function linprog of the optimisation Toolbox of Mathworks resolves :
        # min f'* x / M * x <= b
        # So the objective could be expressed as : min V / (discount*P-I) * V <= - PR
        # To avoid loop on states, the matrix M is structured following actions M(A*S,S)
    
        self.f = ones((self.S, 1))
    
        self.M = zeros((self.A * self.S, self.S))
        for aa in range(self.A):
            pos = (aa + 1) * self.S
            self.M[(pos - self.S):pos, :] = discount * self.P[aa] - speye(self.S, self.S)
        
        self.M = matrix(self.M)
    
    def iterate(self):
        """"""
        self.time = time()
        
        self.value = self.linprog(self.f, self.M, -self.R)
        
        # bellmanOperator() uses self.P, self.R, self.discount and self.value
        # directly and returns the (policy, value) pair
        self.policy, self.value = self.bellmanOperator()
        
        self.time = time() - self.time

class PolicyIteration(MDP):
    """Resolution of discounted MDP with policy iteration algorithm.
    
    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or 
             a cell array (1xA), each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount rate, in ]0, 1[
    policy0(S) = starting policy, optional 
    max_iter = maximum number of iterations to be done, greater than 0,
             optional (default 1000)
    eval_type = type of function used to evaluate policy: 
             0 for mdp_eval_policy_matrix, else mdp_eval_policy_iterative
             optional (default 0)
             
    Evaluation
    ----------
    V(S)   = value function 
    policy(S) = optimal policy
    iter     = number of done iterations
    cpu_time = used CPU time
    
    Notes
    -----
    In verbose mode, at each iteration, displays the number 
    of different actions between policy n-1 and n
    
    Examples
    --------
    >>> import mdp
    >>> P, R = mdp.exampleRand(5, 3)
    >>> pi = mdp.PolicyIteration(P, R, 0.9)
    >>> pi.iterate()
    
    """
    
    def __init__(self, transitions, reward, discount, policy0=None, max_iter=1000, eval_type=0):
        """"""
        
        MDP.__init__(self, transitions, reward, discount, max_iter)
        
        if policy0 is None:
            # initialise the policy to the one which maximises the expected
            # immediate reward
            self.value = matrix(zeros((self.S, 1)))
            self.policy, null = self.bellmanOperator()
            del null
        else:
            policy0 = array(policy0)
            
            if not policy0.shape in ((self.S, ), (self.S, 1), (1, self.S)):
                raise ValueError('PyMDPtoolbox: policy0 must be a vector with length S')
            
            policy0 = matrix(policy0.reshape(self.S, 1))
            
            if mod(policy0, 1).any() or (policy0 < 0).any() or (policy0 >= self.S).any():
                raise ValueError('PyMDPtoolbox: policy0 must be a vector of integers between 0 and S - 1')
            else:
                self.policy = policy0
        
        # set or reset the initial values to zero
        self.value = matrix(zeros((self.S, 1)))
        
        if eval_type in (0, "matrix"):
            from numpy.linalg import solve
            from scipy.sparse import eye
            self.speye = eye
            self.lin_eq = solve
            self.eval_type = "matrix"
        elif eval_type in (1, "iterative"):
            self.eval_type = "iterative"
        else:
            raise ValueError("PyMDPtoolbox: eval_type should be 0 for matrix "\
                "evaluation or 1 for iterative evaluation. strings 'matrix' " \
                "and 'iterative' can also be used.")
    
    def evalPolicyIterative(self, V0=0, epsilon=0.0001, max_iter=10000):
        """Policy evaluation using iteration
        
        Arguments
        ---------
        Let S = number of states, A = number of actions
        P(SxSxA)  = transition matrix 
            P could be an array with 3 dimensions or 
            a cell array (1xA), each cell containing a matrix possibly sparse
        R(SxSxA) or (SxA) = reward matrix
            R could be an array with 3 dimensions (SxSxA) or 
            a cell array (1xA), each cell containing a sparse matrix (SxS) or
            a 2D array(SxA) possibly sparse  
        discount  = discount rate in ]0; 1[
        policy(S) = a policy
        V0(S)     = starting value function, optional (default : zeros(S,1))
        epsilon   = epsilon-optimal policy search, greater than 0,
            optional (default : 0.0001)
        max_iter  = maximum number of iterations to be done, greater than 0,
            optional (default : 10000)
            
        Evaluation
        ----------
        Vpolicy(S) = value function, associated to a specific policy
        
        Notes
        -----
        In verbose mode, at each iteration, displays the condition which stopped iterations:
        epsilon-optimum value function found or maximum number of iterations reached.
        """
        if V0 == 0:
            policy_V = zeros((self.S, 1))
        else:
            raise NotImplementedError("evalPolicyIterative: case V0 != 0 not implemented. Use V0=0 instead.")
        
        policy_P, policy_R = self.computePpolicyPRpolicy()
        
        if self.verbose:
            print('  Iteration    V_variation')
        
        itr = 0
        done = False
        while not done:
            itr = itr + 1
            
            Vprev = policy_V
            policy_V = policy_R + self.discount * policy_P * Vprev
            
            variation = absolute(policy_V - Vprev).max()
            if self.verbose:
                print('      %s         %s' % (itr, variation))
                
            if variation < ((1 - self.discount) / self.discount) * epsilon: # to ensure |Vn - Vpolicy| < epsilon
                done = True
                if self.verbose:
                    print('PyMDPtoolbox: iterations stopped, epsilon-optimal value function')
            elif itr == max_iter:
                done = True
                if self.verbose:
                    print('PyMDPtoolbox: iterations stopped by maximum number of iteration condition')
        
        self.value = policy_V
    
    def evalPolicyMatrix(self):
        """Evaluation of the value function of a policy
        
        Arguments 
        ---------
        Let S = number of states, A = number of actions
        P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or 
             a cell array (1xA), each cell containing a matrix (SxS) possibly sparse
        R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
        discount = discount rate in ]0; 1[
        policy(S) = a policy
        
        Evaluation
        ----------
        Vpolicy(S) = value function of the policy
        """
        
        Ppolicy, Rpolicy = self.computePpolicyPRpolicy()
        # V = PR + gPV  => (I-gP)V = PR  => V = inv(I-gP)* PR
        self.value = self.lin_eq((self.speye(self.S, self.S) - self.discount * Ppolicy) , Rpolicy)
    
    def iterate(self):
        """Run the policy iteration algorithm."""
        
        if self.verbose:
            print('  Iteration  Number_of_different_actions')
        
        done = False
        self.time = time()
        
        while not done:
            self.iter = self.iter + 1
            
            # these evalPolicy* functions will update the classes value
            # attribute
            if self.eval_type == "matrix":
                self.evalPolicyMatrix()
            elif self.eval_type == "iterative":
                self.evalPolicyIterative()
            
            # This should update the classes policy attribute but leave the
            # value alone
            policy_next, null = self.bellmanOperator()
            del null
            
            n_different = (policy_next != self.policy).sum()
            
            if self.verbose:
                print('       %s                 %s' % (self.iter, n_different))
            
            if n_different == 0:
                done = True
                if self.verbose:
                    print("...iterations stopped, unchanging policy found")
            elif (self.iter == self.max_iter):
                done = True
                if self.verbose:
                    print("...iterations stopped by maximum number of iteration condition")
            else:
                self.policy = policy_next
        
        self.time = time() - self.time
        
        # store value and policy as tuples
        self.value = tuple(array(self.value).reshape(self.S).tolist())
        self.policy = tuple(array(self.policy).reshape(self.S).tolist())

class PolicyIterationModified(MDP):
    """Resolution of discounted MDP  with policy iteration algorithm
    
    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or 
             a cell array (1xA), each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount rate, in ]0, 1[
    policy0(S) = starting policy, optional 
    max_iter = maximum number of iterations to be done, greater than 0,
             optional (default 1000)
    eval_type = type of function used to evaluate policy: 
             0 for mdp_eval_policy_matrix, else mdp_eval_policy_iterative
             optional (default 0)
    
    Data Attributes
    ---------------
    V(S)   = value function 
    policy(S) = optimal policy
    iter     = number of done iterations
    cpu_time = used CPU time
    
    Notes
    -----
    In verbose mode, at each iteration, displays the number 
    of different actions between policy n-1 and n
    
    Examples
    --------
    >>> import mdp
    """
    
    def __init__(self, transitions, reward, discount, epsilon=0.01, max_iter=10):
        """"""
        
        MDP.__init__(self, transitions, reward, discount, max_iter)
        
        if epsilon <= 0:
            raise ValueError("epsilon must be greater than 0")
        
        # computation of threshold of variation for V for an epsilon-optimal policy
        if self.discount != 1:
            self.thresh = epsilon * (1 - self.discount) / self.discount
        else:
            self.thresh = epsilon
        
        if discount == 1:
            self.value = matrix(zeros((self.S, 1)))
        else:
            # take the smallest element of the reward matrix (the numpy
            # equivalent of MATLAB's min(min(R)))
            self.value = 1 / (1 - discount) * self.R.min() * ones((self.S, 1))
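            # Added note (not in the original source): R.min() / (1 - discount)
            # is a lower bound on any discounted return, so the value function
            # starts below the optimum, a common starting point for modified
            # policy iteration.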
    
    def iterate(self):
        """"""
        
        if self.verbose:
            print('  Iteration  V_variation')
        
        self.time = time()
        
        done = False
        while not done:
            self.iter = self.iter + 1
            
            # bellmanOperator() uses the current self.value and returns the
            # (policy, value) pair
            policy, Vnext = self.bellmanOperator()
            #[Ppolicy, PRpolicy] = mdp_computePpolicyPRpolicy(P, PR, policy);
            
            variation = getSpan(Vnext - self.value)
            if self.verbose:
                print("      %s         %s" % (self.iter, variation))
            
            self.value = Vnext
            if variation < self.thresh:
                done = True
            else:
                is_verbose = False
                if self.verbose:
                    self.setSilent()
                    is_verbose = True
                
                self.value = self.evalPolicyIterative()
                
                if is_verbose:
                    self.setVerbose()
        
        self.time = time() - self.time

class QLearning(MDP):
    """Evaluates the matrix Q, using the Q learning algorithm.
    
    Let S = number of states, A = number of actions
    
    Parameters
    ----------
    P : transition matrix (SxSxA)
        P could be an array with 3 dimensions or a cell array (1xA), each
        cell containing a sparse matrix (SxS)
    R : reward matrix(SxSxA) or (SxA)
        R could be an array with 3 dimensions (SxSxA) or a cell array
        (1xA), each cell containing a sparse matrix (SxS) or a 2D
        array(SxA) possibly sparse
    discount : discount rate
        in ]0; 1[    
    n_iter : number of iterations to execute (optional).
        Default value = 10000; it must be an integer of at least 10000.
    
    Results
    -------
    Q : learned Q matrix (SxA) 
    
    value : learned value function (S).
    
    policy : learned optimal policy (S).
    
    mean_discrepancy : vector of V discrepancy mean over 100 iterations
        The length of this vector for the default value of n_iter is 100
        (n_iter/100).

    Examples
    --------
    >>> import mdp
    >>> P, R = mdp.exampleForest()
    >>> ql = mdp.QLearning(P, R, 0.96)
    >>> ql.iterate()
    >>> ql.Q
    array([[  0.        ,   0.        ],
           [  0.01062959,   0.79870231],
           [ 10.08191776,   0.35309404]])
    >>> ql.value
    array([  0.        ,   0.79870231,  10.08191776])
    >>> ql.policy
    array([0, 1, 0])
    
    >>> import mdp
    >>> import numpy as np
    >>> P = np.array([[[0.5, 0.5],[0.8, 0.2]],[[0, 1],[0.1, 0.9]]])
    >>> R = np.array([[5, 10], [-1, 2]])
    >>> ql = mdp.QLearning(P, R, 0.9)
    >>> ql.iterate()
    >>> ql.Q
    array([[ 94.99525115,  99.99999007],
           [ 53.92930199,   5.57331205]])
    >>> ql.value
    array([ 99.99999007,  53.92930199])
    >>> ql.policy
    array([1, 0])
    >>> ql.time
    0.6501460075378418
    
    """
    
    def __init__(self, transitions, reward, discount, n_iter=10000):
        """Evaluation of the matrix Q, using the Q learning algorithm
        """
        
        # The following check won't be done in MDP()'s initialisation, so let's
        # do it here
        if (n_iter < 10000):
            raise ValueError("PyMDPtoolbox: n_iter should be at least 10000")
        
        # after this n_iter will be known as self.max_iter
        MDP.__init__(self, transitions, reward, discount, n_iter)
        
        # Initialisations
        self.Q = zeros((self.S, self.A))
        #self.dQ = zeros(self.S, self.A)
        self.mean_discrepancy = []
        self.discrepancy = []
        
    def iterate(self):
        """
        """
        self.time = time()
        
        # initial state choice
        # s = randint(0, self.S - 1)
        
        for n in range(self.max_iter):
            
            # Reinitialisation of trajectories every 100 transitions
            if ((n % 100) == 0):
                s = randint(0, self.S - 1)
            
            # Action choice : greedy with increasing probability
            # probability 1-(1/log(n+2)) can be changed
            pn = random()
            if (pn < (1 - (1 / log(n + 2)))):
                # optimal_action = self.Q[s, :].max()
                a = self.Q[s, :].argmax()
            else:
                a = randint(0, self.A - 1)
            
            # Simulating next state s_new and reward associated to <s,s_new,a>
            p_s_new = random()
            p = 0
            s_new = -1
            while ((p < p_s_new) and (s_new < (self.S - 1))):
                s_new = s_new + 1
                p = p + self.P[a][s, s_new]
            
            if (self.R.dtype == object):
                r = self.R[a][s, s_new]
            elif (self.R.ndim == 3):
                r = self.R[a, s, s_new]
            else:
                r = self.R[s, a]
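
            # Added note (not in the original source): the quantity computed
            # below is the temporal-difference error
            #
            #     delta = r + discount * max_a Q(s_new, a) - Q(s, a)
            #
            # towards which Q(s, a) is moved using the decaying coefficient
            # mentioned in the next comment.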
            
            # Updating the value of Q
            # Decaying update coefficient (1/sqrt(n+2)) can be changed
            delta = r + self.discount * self.Q[s_new, :].max() - self.Q[s, a]