# -*- coding: utf-8 -*-
"""
Copyright (c) 2011, 2012, 2013 Steven Cordwell
Copyright (c) 2009, Iadine Chadès
Copyright (c) 2009, Marie-Josée Cros
Copyright (c) 2009, Frédérick Garcia
Copyright (c) 2009, Régis Sabbadin

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

  * Redistributions of source code must retain the above copyright notice, this
    list of conditions and the following disclaimer.
  * Redistributions in binary form must reproduce the above copyright notice,
    this list of conditions and the following disclaimer in the documentation
    and/or other materials provided with the distribution.
  * Neither the name of the <ORGANIZATION> nor the names of its contributors
    may be used to endorse or promote products derived from this software
    without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""

from numpy import absolute, array, diag, matrix, mean, mod, multiply, ndarray
from numpy import ones, zeros
from numpy.random import rand
from math import ceil, log, sqrt
from random import randint, random
from scipy.sparse import csr_matrix as sparse
from time import time

mdperr = {
"mat_nonneg" :
    "PyMDPtoolbox: Probabilities must be non-negative.",
"mat_square" :
    "PyMDPtoolbox: The matrix must be square.",
"mat_stoch" :
    "PyMDPtoolbox: Rows of the matrix must sum to one (1).",
"mask_numpy" :
    "PyMDPtoolbox: mask must be a numpy array or matrix; i.e. type(mask) is "
    "ndarray or type(mask) is matrix.", 
"mask_SbyS" : 
    "PyMDPtoolbox: The mask must have shape SxS; i.e. mask.shape = (S, S).",
"obj_shape" :
    "PyMDPtoolbox: Object arrays for transition probabilities and rewards "
    "must have only 1 dimension: the number of actions A. Each element of "
    "the object array contains an SxS ndarray or matrix.",
"obj_square" :
    "PyMDPtoolbox: Each element of an object array for transition "
    "probabilities and rewards must contain an SxS ndarray or matrix; i.e. "
    "P[a].shape = (S, S) or R[a].shape = (S, S).",
"P_type" :
    "PyMDPtoolbox: The transition probabilities must be in a numpy array; "
    "i.e. type(P) is ndarray.",
"P_shape" :
    "PyMDPtoolbox: The transition probability array must have the shape "
    "(A, S, S)  with S : number of states greater than 0 and A : number of "
    "actions greater than 0. i.e. R.shape = (A, S, S)",
"PR_incompat" :
    "PyMDPtoolbox: Incompatibility between P and R dimensions.",
"prob_in01" :
    "PyMDPtoolbox: Probability p must be in [0; 1].",
"R_type" :
    "PyMDPtoolbox: The rewards must be in a numpy array; i.e. type(R) is "
    "ndarray, or numpy matrix; i.e. type(R) is matrix.",
"R_shape" :
    "PyMDPtoolbox: The reward matrix R must be an array of shape (A, S, S) or "
    "(S, A) with S : number of states greater than 0 and A : number of actions "
    "greater than 0. i.e. R.shape = (S, A) or (A, S, S).",
"R_gt_0" :
    "PyMDPtoolbox: The rewards must be greater than 0.",
"S_gt_1" :
    "PyMDPtoolbox: Number of states S must be greater than 1.",
"SA_gt_1" : 
    "PyMDPtoolbox: The number of states S and the number of actions A must be "
87 88 89 90 91
    "greater than 1.",
"discount_rng" : 
    "PyMDPtoolbox: Discount rate must be in ]0; 1]",
"maxi_min" :
    "PyMDPtoolbox: The maximum number of iterations must be greater than 0"
}
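
# These messages are raised by the validation helpers below; for example
# (illustrative of how the dictionary is used):
#
#     raise ValueError(mdperr["mat_stoch"])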

def check(P, R):
    """Checks if the matrices P and R define a Markov Decision Process.
    
    Let S = number of states, A = number of actions.
    The transition matrix P must be on the shape (A, S, S) and P[a,:,:]
    must be stochastic.
    The reward matrix R must be on the shape (A, S, S) or (S, A).
    Raises an error if P and R do not define a MDP.
    
    Parameters
    ----------
    P : transition matrix (A, S, S)
        P could be an array with 3 dimensions or a object array (A, ),
        each cell containing a matrix (S, S) possibly sparse
    R : reward matrix (A, S, S) or (S, A)
        R could be an array with 3 dimensions (SxSxA) or a object array
        (A, ), each cell containing a sparse matrix (S, S) or a 2D
        array(S, A) possibly sparse  
    """
    
    # Check of P
    # transitions must be a numpy array: either an AxSxS ndarray (with any
    # dtype other than "object"); or, a 1xA ndarray with an "object" dtype,
    # each element containing an SxS array. An AxSxS array will be
    # converted to an object array. A numpy object array is similar to a
    # MATLAB cell array.
    if (not type(P) is ndarray):
        raise TypeError(mdperr["P_type"])
    
    if (not type(R) is ndarray):
        raise TypeError(mdperr["R_type"])
    
    # NumPy has an array type of 'object', which is roughly equivalent to
    # the MATLAB cell array. These are most useful for storing sparse
    # matrices as these can only have two dimensions whereas we want to be
    # able to store a transition matrix for each action. If the dtype of
    # the transition probability array is object then we store this as
    # P_is_object = True.
    # If it is an object array, then it should only have one dimension,
    # otherwise fail with a message explaining why.
    # If it is a normal array then the number of dimensions must be exactly
    # three, otherwise fail with a message explaining why.
    if (P.dtype == object):
        if (P.ndim > 1):
            raise ValueError(mdperr["obj_shape"])
        else:
            P_is_object = True
    else:
        if (P.ndim != 3):
            raise ValueError(mdperr["P_shape"])
        else:
            P_is_object = False
    
    # As above but for the reward array. A difference is that the reward
    # array can have either two or 3 dimensions.
    if (R.dtype == object):
        if (R.ndim > 1):
            raise ValueError(mdperr["obj_shape"])
        else:
            R_is_object = True
    else:
        if (not R.ndim in (2, 3)):
            raise ValueError(mdperr["R_shape"])
        else:
            R_is_object = False
    
    # We want to make sure that the transition probability array and the 
    # reward array are in agreement. This means that both should show that
    # there are the same number of actions and the same number of states.
    # Furthermore the probability of transition matrices must be SxS in
    # shape, so we check for that also.
    if P_is_object:
        # If the user has put their transition matrices into a numpy array
        # with dtype of 'object', then it is possible that they have made a
        # mistake and not all of the matrices are of the same shape. So,
        # here we record the number of actions and states that the first
        # matrix in element zero of the object array says it has. After
        # that we check that every other matrix also reports the same
        # number of actions and states, otherwise fail with an error.
        # aP: the number of actions in the transition array. This
        # corresponds to the number of elements in the object array.
        aP = P.shape[0]
        # sP0: the number of states as reported by the number of rows of
        # the transition matrix
        # sP1: the number of states as reported by the number of columns of
        # the transition matrix
        sP0, sP1 = P[0].shape
        # Now we check to see that every element of the object array holds
        # a matrix of the same shape, otherwise fail.
        for aa in range(1, aP):
            # sp0aa and sp1aa represents the number of states in each
            # subsequent element of the object array. If it doesn't match
            # what was found in the first element, then we need to fail
            # telling the user what needs to be fixed.
            sP0aa, sP1aa = P[aa].shape
            if ((sP0aa != sP0) or (sP1aa != sP1)):
                raise ValueError(mdperr["obj_square"])
    else:
        # if we are using a normal array for this, then the first
        # dimension should be the number of actions, and the second and 
        # third should be the number of states
        aP, sP0, sP1 = P.shape
    
    # the first dimension of the transition matrix must report the same
    # number of states as the second dimension. If not then we are not
    # dealing with a square matrix and it is not a valid transition
    # probability. Also, if the number of actions is less than one, or the
    # number of states is less than one, then it also is not a valid
    # transition probability.
    if ((sP0 < 1) or (aP < 1) or (sP0 != sP1)):
        raise ValueError(mdperr["P_shape"])
    
    # now we check that each transition matrix is square-stochastic. For
    # object arrays this is the matrix held in each element, but for
    # normal arrays this is a matrix formed by taking a slice of the array
    for aa in range(aP):
        if P_is_object:
            checkSquareStochastic(P[aa])
        else:
            checkSquareStochastic(P[aa, :, :])
        # aa = aa + 1 # why was this here?
    
    if R_is_object:
        # if the reward array has an object dtype, then we check that
        # each element contains a matrix of the same shape as we did 
        # above with the transition array.
        aR = R.shape[0]
        sR0, sR1 = R[0].shape
        for aa in range(1, aR):
            sR0aa, sR1aa = R[aa].shape
            if ((sR0aa != sR0) or (sR1aa != sR1)):
                raise ValueError(mdperr["obj_square"])
    elif (R.ndim == 3):
        # This indicates that the reward matrices are constructed per 
        # transition, so that the first dimension is the actions and
        # the second two dimensions are the states.
        aR, sR0, sR1 = R.shape
    else:
        # then the reward matrix is per state, so the first dimension is 
        # the states and the second dimension is the actions.
        sR0, aR = R.shape
        # this is added just so that the next check doesn't error out
        # saying that sR1 doesn't exist
        sR1 = sR0
    
    # the number of actions must be more than zero, the number of states
    # must also be more than 0, and the states must agree
    if ((sR0 < 1) or (aR < 1) or (sR0 != sR1)):
        raise ValueError(mdperr["R_shape"])
    
    # now we check to see that what the transition array is reporting and
    # what the reward array is reporting agree as to the number of actions
    # and states. If not then fail explaining the situation
    if (sP0 != sR0) or (aP != aR):
        raise ValueError(mdperr["PR_incompat"])
        
    # We are at the end of the checks, so if no exceptions have been raised
    # then that means there are (hopefully) no errors and we return None
    return None
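
# Illustrative usage of check() (a sketch, not part of the original module):
# the forest example defined below gives a valid (P, R) pair, so the call
# returns None silently, while something like check(P[0], R) would raise
# ValueError(mdperr["P_shape"]) because a single SxS slice is not (A, S, S).
#
#     >>> P, R = exampleForest()
#     >>> check(P, R)    # no exception and no output: P and R define an MDP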

def checkSquareStochastic(Z):
    """Check if Z is a square stochastic matrix
    
    Parameters
    ----------
    Z : an SxS matrix. It could be a numpy ndarray SxS, or a scipy.sparse
        csr_matrix
    
    Evaluation
    ----------
    Returns None if no error has been detected
    """
    s1, s2 = Z.shape
    if (s1 != s2):
        raise ValueError(mdperr["mat_square"])
    elif (absolute(Z.sum(axis=1) - ones(s2))).max() > 10**(-12):
        raise ValueError(mdperr["mat_stoch"])
    elif ((type(Z) is ndarray) or (type(Z) is matrix)) and (Z < 0).any():
        raise ValueError(mdperr["mat_nonneg"])
    elif (type(Z) is sparse) and (Z.data < 0).any():
        raise ValueError(mdperr["mat_nonneg"]) 
    else:
        return(None)
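
# Illustrative usage of checkSquareStochastic() (a sketch): a square matrix
# with non-negative rows summing to one passes silently; a row summing to
# anything else would raise ValueError(mdperr["mat_stoch"]) instead.
#
#     >>> checkSquareStochastic(array([[0.5, 0.5], [0.0, 1.0]]))    # returns None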

def exampleForest(S=3, r1=4, r2=2, p=0.1):
    """
    Generates a Markov Decision Process example based on a simple forest
    management.
    
    See the related documentation for more detail.
    
    Parameters
    ----------
    S : number of states (> 0), optional (default 3)
    r1 : reward when forest is in the oldest state and action Wait is performed,
        optional (default 4)
    r2 : reward when forest is in the oldest state and action Cut is performed, 
        optional (default 2)
    p : probability of wild fire occurrence, in ]0, 1[, optional (default 0.1)
    
    Evaluation
    ----------
    P : transition probability matrix (A, S, S)
    R : reward matrix (S, A)
    
    Examples
    --------
    >>> import mdp
    >>> P, R = mdp.exampleForest()
    >>> P
    array([[[ 0.1,  0.9,  0. ],
            [ 0.1,  0. ,  0.9],
            [ 0.1,  0. ,  0.9]],
           [[ 1. ,  0. ,  0. ],
            [ 1. ,  0. ,  0. ],
            [ 1. ,  0. ,  0. ]]])
    >>> R
    array([[ 0.,  0.],
           [ 0.,  1.],
           [ 4.,  2.]])
    
    """
    if (S <= 1):
        raise ValueError(mdperr["S_gt_1"])
    if (r1 <= 0) or (r2 <= 0):
        raise ValueError(mdperr["R_gt_0"])
    if (p < 0 or p > 1):
        raise ValueError(mdperr["prob_in01"])
    
    # Definition of Transition matrix P(:,:,1) associated to action Wait (action 1) and
    # P(:,:,2) associated to action Cut (action 2)
    #             | p 1-p 0.......0  |                  | 1 0..........0 |
    #             | .  0 1-p 0....0  |                  | . .          . |
    #  P(:,:,1) = | .  .  0  .       |  and P(:,:,2) =  | . .          . |
    #             | .  .        .    |                  | . .          . |
    #             | .  .         1-p |                  | . .          . |
    #             | p  0  0....0 1-p |                  | 1 0..........0 |
    P = zeros((2, S, S))
    P[0, :, :] = (1 - p) * diag(ones(S - 1), 1)
    P[0, :, 0] = p
    P[0, S - 1, S - 1] = (1 - p)
    P[1, :, :] = zeros((S, S))
    P[1, :, 0] = 1
    
    # Definition of Reward matrix R1 associated to action Wait and 
    # R2 associated to action Cut
    #           | 0  |                   | 0  |
    #           | .  |                   | 1  |
    #  R(:,1) = | .  |  and     R(:,2) = | .  |	
    #           | .  |                   | .  |
    #           | 0  |                   | 1  |                   
    #           | r1 |                   | r2 |
    R = zeros((S, 2))
    R[S - 1, 0] = r1
    R[:, 1] = ones(S)
    R[0, 1] = 0
    R[S - 1, 1] = r2
    
    return (P, R)

def exampleRand(S, A, is_sparse=False, mask=None):
    """Generates a random Markov Decision Process.
    
    Parameters
    ----------
    S : number of states (> 0)
    A : number of actions (> 0)
    is_sparse : False to have matrices in dense format, True to have sparse
        matrices, optional (default False).
    mask : matrix with 0 and 1 (0 indicates a place for a zero
           probability), optional (SxS) (default, random)
    
    Returns
    ----------
    P : transition probability matrix (SxSxA)
    R : reward matrix (SxSxA)

    Examples
    --------
    >>> import mdp
    >>> P, R = mdp.exampleRand(5, 3)

    """
    if (S < 1 or A < 1):
        raise ValueError(mdperr["SA_gt_1"])
    
    try:
        if (mask is not None) and ((mask.shape[0] != S) or (mask.shape[1] != S)):
            raise ValueError(mdperr["mask_SbyS"])
    except AttributeError:
        raise TypeError(mdperr["mask_numpy"])
    
    if mask is None:
        mask = rand(A, S, S)
        for a in range(A):
            r = random()
            mask[a][mask[a] < r] = 0
            mask[a][mask[a] >= r] = 1
    
    if is_sparse:
        # definition of transition matrix : square stochastic matrix
        P = zeros((A, ), dtype=object)
        # definition of reward matrix (values between -1 and +1)
        R = zeros((A, ), dtype=object)
        for a in range(A):
            PP = mask[a] * rand(S, S)
            for s in range(S):
                if (mask[a, s, :].sum() == 0):
                    PP[s, randint(0, S - 1)] = 1
                PP[s, :] = PP[s, :] / PP[s, :].sum()
            P[a] = sparse(PP)
            R[a] = sparse(mask[a] * (2 * rand(S, S) - ones((S, S))))
    else:
        # definition of transition matrix : square stochastic matrix
        P = zeros((A, S, S))
        # definition of reward matrix (values between -1 and +1)
        R = zeros((A, S, S))
        for a in range(A):
            P[a, :, :] = mask[a] * rand(S, S)
            for s in range(S):
                if (mask[a, s, :].sum() == 0):
                    P[a, s, randint(0, S - 1)] = 1
                P[a, s, :] = P[a, s, :] / P[a, s, :].sum()
            R[a, :, :] = mask[a] * (2 * rand(S, S) - ones((S, S), dtype=int))
    
    return (P, R)
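
# Illustrative usage of the sparse option (a sketch, not part of the original
# docstring): with is_sparse=True, P and R are (A,) object arrays whose
# elements are scipy.sparse matrices of shape (S, S).
#
#     >>> P, R = exampleRand(10, 3, is_sparse=True)
#     >>> P.shape, P[0].shape
#     ((3,), (10, 10))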

def getSpan(W):
    """Returns the span of W
    
    sp(W) = max W(s) - min W(s)
    """
    return (W.max() - W.min())
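
# For example (illustrative), getSpan(array([2., 5., 3.])) returns 3.0, the
# difference between the largest and smallest components of W.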

class MDP(object):
    """The Markov Decision Problem Toolbox."""
    
    def __init__(self, transitions, reward, discount, epsilon, max_iter):
        """"""
        
        # if the discount is None then the algorithm is assumed to not use it
        # in its computations
        if type(discount) in (int, float):
            if (discount <= 0) or (discount > 1):
                raise ValueError(mdperr["discount_rng"])
            else:
                if discount == 1:
                    print("PyMDPtoolbox WARNING: check conditions of convergence."\
                        "With no discount, convergence is not always assumed.")
                self.discount = discount
        elif not discount is None:
            raise ValueError("PyMDPtoolbox: the discount must be a positive " \
                "real number less than or equal to one.")
        
        # if the max_iter is None then the algorithm is assumed to not use it
        # in its computations
        if type(max_iter) in (int, float):
            if max_iter <= 0:
                raise ValueError(mdperr["maxi_min"])
            else:
                self.max_iter = max_iter
        elif not max_iter is None:
            raise ValueError("PyMDPtoolbox: max_iter must be a positive real "\
                "number greater than zero.")
        
        if type(epsilon) in (int, float):
            if epsilon <= 0:
                raise ValueError("PyMDPtoolbox: epsilon must be greater than 0")
        elif not epsilon is None:
            raise ValueError("PyMDPtoolbox: epsilon must be a positive real "\
                "number greater than zero.")
        
        # we run a check on P and R to make sure they are describing an MDP. If
        # an exception isn't raised then they are assumed to be correct.
        check(transitions, reward)
        
        # computePR will assign the variables self.S, self.A, self.P and self.R
        self.computePR(transitions, reward)
        
        # the verbosity is by default turned off
        self.verbose = False
        # Initially the time taken to perform the computations is set to None
        self.time = None
        # set the initial iteration count to zero
        self.iter = 0
        
        self.V = None
        self.policy = None
    
    def bellmanOperator(self, V=None):
        """
        Applies the Bellman operator on the value function.
        
        Updates the value function and the Vprev-improving policy.
        
        Returns
        -------
        (policy, value) : tuple of new policy and its value
        """
        # this V should be a reference to the data rather than a copy
        if V is None:
            V = self.V
        else:
            if not ((type(V) in (ndarray, matrix)) and (V.shape == (self.S, 1))):
                raise ValueError("V in bellmanOperator needs to be correct.")
        
        Q = matrix(zeros((self.S, self.A)))
        for aa in range(self.A):
            Q[:, aa] = self.R[:, aa] + (self.discount * self.P[aa] * V)
        
        # Which way is better? if choose the first way, then the classes that
        # call this function must be changed
        # 1. Return, (policy, value)
        return (Q.argmax(axis=1), Q.max(axis=1))
        # 2. update self.policy and self.V directly
        # self.V = Q.max(axis=1)
        # self.policy = Q.argmax(axis=1)
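        #
        # Descriptive note: the loop above computes, for each action a, the
        # action-value column
        #     Q[:, a] = R[:, a] + discount * P[a] * V
        # and the returned pair (Q.argmax(axis=1), Q.max(axis=1)) is the
        # V-improving policy together with its one-step backed-up value.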
    
    def computePR(self, P, R):
        """Computes the reward for the system in one state chosing an action
        
        Arguments
        ---------
        Let S = number of states, A = number of actions
            P(SxSxA)  = transition matrix 
                P could be an array with 3 dimensions or  a cell array (1xA), 
                each cell containing a matrix (SxS) possibly sparse
            R(SxSxA) or (SxA) = reward matrix
                R could be an array with 3 dimensions (SxSxA) or  a cell array 
                (1xA), each cell containing a sparse matrix (SxS) or a 2D 
                array(SxA) possibly sparse  
        Evaluation
        ----------
            PR(SxA)   = reward matrix
        """
        # we assume that P and R define an MDP, i.e. the assumption is that
        # check(P, R) has already been run and doesn't fail.
        
        # make P be an object array with (S, S) shaped array elements
        if (P.dtype == object):
            self.P = P
            self.A = self.P.shape[0]
            self.S = self.P[0].shape[0]
        else: # convert to an object array
            self.A = P.shape[0]
            self.S = P.shape[1]
            self.P = zeros(self.A, dtype=object)
            for aa in range(self.A):
                self.P[aa] = P[aa, :, :]
        
        # make R have the shape (S, A)
        if R.dtype == object:
            # R is object shaped (A,) with each element shaped (S, S)
Steven Cordwell's avatar
Steven Cordwell committed
547
            self.R = zeros((self.S, self.A))
548 549 550 551 552 553
            for aa in range(self.A):
                self.R[:, aa] = multiply(P[aa], R[aa]).sum(1)
        else:
            if R.ndim == 2:
                # R already has shape (S, A)
                self.R = R
            else:
                # R has shape (A, S, S)
                self.R = zeros((self.S, self.A))
                for aa in range(self.A):
                    self.R[:, aa] = multiply(P[aa], R[aa, :, :]).sum(1)
        
        # convert the arrays to numpy matrices
        for aa in range(self.A):
            if (type(self.P[aa]) is ndarray):
                self.P[aa] = matrix(self.P[aa])
        if (type(self.R) is ndarray):
            self.R = matrix(self.R)
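        # Descriptive note: from here on, self.P is an (A,) object array of
        # SxS numpy matrices and self.R is an (S, A) numpy matrix of immediate
        # rewards, computed as r(s, a) = sum_s' P[a][s, s'] * R[a][s, s'] when
        # the rewards were supplied per transition.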

    def iterate(self):
        """This is a placeholder method. Child classes should define their own
        iterate() method.
        """
        raise NotImplementedError("You should create an iterate() method.")
    
    def setSilent(self):
        """Ask for running resolution functions of the MDP Toolbox in silent
        mode.
        """
        self.verbose = False
    
    def setVerbose(self):
        """Ask for running resolution functions of the MDP Toolbox in verbose
        mode.
        """
        self.verbose = True

class FiniteHorizon(MDP):
    """Reolution of finite-horizon MDP with backwards induction
    
    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or 
             a cell array (1xA), each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount factor, in ]0, 1]
    N        = number of periods, greater than 0
    h(S)     = terminal reward, optional (default [0; 0; ... 0] )
    Evaluation
    ----------
    V(S,N+1)     = optimal value function
                 V(:,n) = optimal value function at stage n
                        with stage in 1, ..., N
                        V(:,N+1) = value function for terminal stage 
    policy(S,N)  = optimal policy
                 policy(:,n) = optimal policy at stage n
                        with stage in 1, ...,N
                        policy(:,N) = policy for stage N
    cpu_time = used CPU time
  
    Notes
    -----
    In verbose mode, displays the current stage and policy transpose.

    """

    def __init__(self, transitions, reward, discount, N, h=None):
        """"""
        if N < 1:
            raise ValueError('PyMDPtoolbox: N must be greater than 0')
        else:
            self.N = N
        
        MDP.__init__(self, transitions, reward, discount, None, None)
        
        # remove the iteration counter
        del self.iter
        
        self.V = zeros((self.S, N + 1))
        
        self.policy = zeros((self.S, N), dtype=int)
        
        if not h is None:
            self.V[:, N] = h
        
    def iterate(self):
        """"""
        self.time = time()
        
        for n in range(self.N):
            W, X = self.bellmanOperator(matrix(self.V[:, self.N - n]).reshape(self.S, 1))
            self.V[:, self.N - n - 1] = X.A1
            self.policy[:, self.N - n - 1] = W.A1
            if self.verbose:
                print("stage: %s ... policy transpose : %s") % (self.N - n, self.policy[:, self.N - n -1].tolist())
        
        self.time = time() - self.time
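
    # Illustrative usage (a sketch; this class has no Examples section of its
    # own): backwards induction over N = 3 periods of the forest example should
    # leave a value array with one column per stage plus the terminal stage.
    #
    #     >>> P, R = exampleForest()
    #     >>> fh = FiniteHorizon(P, R, 0.9, 3)
    #     >>> fh.iterate()
    #     >>> fh.V.shape
    #     (3, 4)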

class LP(MDP):
    """Resolution of discounted MDP with linear programming

    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or 
             a cell array (1xA), each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount rate, in ]0; 1[
    h(S)     = terminal reward, optional (default [0; 0; ... 0] )
    
    Evaluation
    ----------
    V(S)   = optimal values
    policy(S) = optimal policy
    cpu_time = used CPU time
    
    Notes    
    -----
    In verbose mode, displays the current stage and policy transpose.
    
    Examples
    --------
    """

    def __init__(self, transitions, reward, discount):
        """"""
        
        try:
            from cvxopt import matrix, solvers
            self.linprog = solvers.lp
        except ImportError:
            raise ImportError("The python module cvxopt is required to use " \
                "linear programming functionality.")
        
        from scipy.sparse import eye as speye

        MDP.__init__(self, transitions, reward, discount, None, None)
        
        # The objective is to resolve : min V / V >= PR + discount*P*V
        # The function linprog of the optimisation Toolbox of Mathworks resolves :
        # min f'* x / M * x <= b
        # So the objective could be expressed as : min V / (discount*P-I) * V <= - PR
        # To avoid loop on states, the matrix M is structured following actions M(A*S,S)
    
        self.f = ones((self.S, 1))
    
        self.M = zeros((self.A * self.S, self.S))
        for aa in range(self.A):
            pos = (aa + 1) * self.S
            self.M[(pos - self.S):pos, :] = discount * self.P[aa] - speye(self.S, self.S)
        
        self.M = matrix(self.M)
    
    def iterate(self):
        """"""
        self.time = time()
        
        self.V = self.linprog(self.f, self.M, -self.R)
        
        self.policy, self.V = self.bellmanOperator(self.V)
        
        self.time = time() - self.time

class PolicyIteration(MDP):
    """Resolution of discounted MDP with policy iteration algorithm.
    
    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or 
             a cell array (1xA), each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount rate, in ]0, 1[
    policy0(S) = starting policy, optional 
    max_iter = maximum number of iterations to be done, greater than 0,
             optional (default 1000)
    eval_type = type of function used to evaluate policy: 
             0 for mdp_eval_policy_matrix, else mdp_eval_policy_iterative
             optional (default 0)
             
    Evaluation
    ----------
    V(S)   = value function 
    policy(S) = optimal policy
    iter     = number of done iterations
    cpu_time = used CPU time
    
    Notes
    -----
    In verbose mode, at each iteration, displays the number
    of different actions between policy n-1 and n
    
    Examples
    --------
    >>> import mdp
    >>> P, R = mdp.exampleRand(5, 3)
    >>> pi = mdp.PolicyIteration(P, R, 0.9)
    >>> pi.iterate()
    
    """
    
    def __init__(self, transitions, reward, discount, policy0=None, max_iter=1000, eval_type=0):
        """"""
        
        MDP.__init__(self, transitions, reward, discount, None, max_iter)
        
        if policy0 is None:
            # initialise the policy to the one which maximises the expected
            # immediate reward
            self.V = matrix(zeros((self.S, 1)))
            self.policy, null = self.bellmanOperator()
            del null
        else:
            policy0 = array(policy0)
            
            if not policy0.shape in ((self.S, ), (self.S, 1), (1, self.S)):
                raise ValueError('PyMDPtoolbox: policy0 must be a vector with length S')
            
            policy0 = matrix(policy0.reshape(self.S, 1))
            
            if mod(policy0, 1).any() or (policy0 < 0).any() or (policy0 >= self.S).any():
                raise ValueError('PyMDPtoolbox: policy0 must be a vector of integers between 0 and S-1')
            else:
                self.policy = policy0
        
        # set or reset the initial values to zero
        self.V = matrix(zeros((self.S, 1)))
        
        if eval_type in (0, "matrix"):
            from numpy.linalg import solve
            from scipy.sparse import eye
            self.speye = eye
            self.lin_eq = solve
            self.eval_type = "matrix"
        elif eval_type in (1, "iterative"):
            self.eval_type = "iterative"
        else:
            raise ValueError("PyMDPtoolbox: eval_type should be 0 for matrix "\
                "evaluation or 1 for iterative evaluation. strings 'matrix' " \
                "and 'iterative' can also be used.")
    
    def computePpolicyPRpolicy(self):
        """Computes the transition matrix and the reward matrix for a policy
        
        Arguments
        ---------
        Let S = number of states, A = number of actions
        P(SxSxA)  = transition matrix 
             P could be an array with 3 dimensions or 
             a cell array (1xA), each cell containing a matrix (SxS) possibly sparse
        R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
        policy(S) = a policy
        
        Evaluation
        ----------
        Ppolicy(SxS)  = transition matrix for policy
        PRpolicy(S)   = reward matrix for policy
        """
        Ppolicy = matrix(zeros((self.S, self.S)))
        Rpolicy = matrix(zeros((self.S, 1)))
        for aa in range(self.A): # avoid looping over S
        
            # the rows that use action a. .getA1() is used to make sure that
            # ind is a 1 dimensional vector
828
            ind = (self.policy == aa).nonzero()[0].getA1()
            if ind.size > 0: # if no rows use action a, then no point continuing
                Ppolicy[ind, :] = self.P[aa][ind, :]
                
                #PR = self.computePR() # an apparently unneeded line, and
                # perhaps harmful in this implementation c.f.
                # mdp_computePpolicyPRpolicy.m
                Rpolicy[ind] = self.R[ind, aa]
        
        # self.R cannot be sparse with the code in its current condition, but
        # it should be possible in the future. Also, if R is so big that its
        # a good idea to use a sparse matrix for it, then converting PRpolicy
        # from a dense to sparse matrix doesn't seem very memory efficient
        if type(self.R) is sparse:
            Rpolicy = sparse(Rpolicy)
        
        #self.Ppolicy = Ppolicy
        #self.Rpolicy = Rpolicy
        return (Ppolicy, Rpolicy)
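        #
        # Descriptive note: for a deterministic policy pi the loop above builds
        #     Ppolicy[s, :] = P[pi(s)][s, :]   and   Rpolicy[s] = R[s, pi(s)]
        # action by action, over the states that pi assigns to each action.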
    
    def evalPolicyIterative(self, V0=0, epsilon=0.0001, max_iter=10000):
        """Policy evaluation using iteration
        
        Arguments
        ---------
        Let S = number of states, A = number of actions
        P(SxSxA)  = transition matrix 
            P could be an array with 3 dimensions or 
            a cell array (1xS), each cell containing a matrix possibly sparse
        R(SxSxA) or (SxA) = reward matrix
            R could be an array with 3 dimensions (SxSxA) or 
            a cell array (1xA), each cell containing a sparse matrix (SxS) or
            a 2D array(SxA) possibly sparse  
        discount  = discount rate in ]0; 1[
        policy(S) = a policy
        V0(S)     = starting value function, optional (default : zeros(S,1))
        epsilon   = epsilon-optimal policy search, greater than 0,
            optional (default : 0.0001)
        max_iter  = maximum number of iterations to be done, greater than 0,
            optional (default : 10000)
            
        Evaluation
        ----------
        Vpolicy(S) = value function, associated to a specific policy
        
        Notes
        -----
        In verbose mode, at each iteration, displays the condition which stopped iterations:
        epsilon-optimum value function found or maximum number of iterations reached.
        """
        if (type(V0) in (int, float)) and (V0 == 0):
            policy_V = zeros((self.S, 1))
        else:
            if (type(V0) in (ndarray, matrix)) and (V0.shape == (self.S, 1)):
                policy_V = V0
            else:
                raise ValueError('PyMDPtoolbox: V0 vector/array type not supported. Use an ndarray or matrix column vector of length S.')
        
        policy_P, policy_R = self.computePpolicyPRpolicy()
        
        if self.verbose:
            print('  Iteration    V_variation')
        
        itr = 0
        done = False
        while not done:
            itr = itr + 1
            
            Vprev = policy_V
            policy_V = policy_R + self.discount * policy_P * Vprev
            
            variation = absolute(policy_V - Vprev).max()
            if self.verbose:
                print('      %s         %s' % (itr, variation))
                
            if variation < ((1 - self.discount) / self.discount) * epsilon: # to ensure |Vn - Vpolicy| < epsilon
                done = True
                if self.verbose:
                    print('PyMDPtoolbox: iterations stopped, epsilon-optimal value function')
            elif itr == max_iter:
                done = True
                if self.verbose:
                    print('PyMDPtoolbox: iterations stopped by maximum number of iteration condition')
        
        self.V = policy_V
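        #
        # Descriptive note: the loop above iterates
        #     V_{k+1} = Rpolicy + discount * Ppolicy * V_k
        # and stops once the sup-norm change drops below
        # (1 - discount) / discount * epsilon, which keeps the result within
        # epsilon of the true value of the policy.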
    
    def evalPolicyMatrix(self):
        """Evaluation of the value function of a policy
        
        Arguments 
        ---------
        Let S = number of states, A = number of actions
        P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or 
             a cell array (1xA), each cell containing a matrix (SxS) possibly sparse
        R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
        discount = discount rate in ]0; 1[
        policy(S) = a policy
        
        Evaluation
        ----------
        Vpolicy(S) = value function of the policy
        """
        
        Ppolicy, Rpolicy = self.computePpolicyPRpolicy()
        # V = PR + gPV  => (I-gP)V = PR  => V = inv(I-gP)* PR
        self.V = self.lin_eq((self.speye(self.S, self.S) - self.discount * Ppolicy), Rpolicy)
    
    def iterate(self):
        """Run the policy iteration algorithm."""
        
        if self.verbose:
            print('  Iteration  Number_of_different_actions')
        
        done = False
        self.time = time()
        
        while not done:
            self.iter = self.iter + 1
            
            # these evalPolicy* functions will update the class's value
            # attribute
            if self.eval_type == "matrix":
                self.evalPolicyMatrix()
            elif self.eval_type == "iterative":
                self.evalPolicyIterative()
            
            # This should update the class's policy attribute but leave the
            # value alone
            policy_next, null = self.bellmanOperator()
            del null
            
            n_different = (policy_next != self.policy).sum()
            
            if self.verbose:
                print('       %s                 %s' % (self.iter, n_different))
            
            if n_different == 0:
                done = True
                if self.verbose:
                    print("...iterations stopped, unchanging policy found")
            elif (self.iter == self.max_iter):
                done = True 
                if self.verbose:
                    print("...iterations stopped by maximum number of iteration condition")
            else:
                self.policy = policy_next
        
        self.time = time() - self.time
        
        # store value and policy as tuples
        self.V = tuple(self.V.getA1().tolist())
        self.policy = tuple(self.policy.getA1().tolist())

class PolicyIterationModified(PolicyIteration):
    """Resolution of discounted MDP  with policy iteration algorithm
    
    Arguments
    ---------
    Let S = number of states, A = number of actions
    P(SxSxA) = transition matrix 
             P could be an array with 3 dimensions or 
             a cell array (1xA), each cell containing a matrix (SxS) possibly sparse
    R(SxSxA) or (SxA) = reward matrix
             R could be an array with 3 dimensions (SxSxA) or 
             a cell array (1xA), each cell containing a sparse matrix (SxS) or
             a 2D array(SxA) possibly sparse  
    discount = discount rate, in ]0, 1[
    policy0(S) = starting policy, optional 
    max_iter = maximum number of iterations to be done, greater than 0,
             optional (default 1000)
    eval_type = type of function used to evaluate policy: 
             0 for mdp_eval_policy_matrix, else mdp_eval_policy_iterative
             optional (default 0)
    
    Data Attributes
    ---------------
    V(S)   = value function 
    policy(S) = optimal policy
    iter     = number of done iterations
    cpu_time = used CPU time
    
    Notes
    -----
    In verbose mode, at each iteration, displays the number
    of different actions between policy n-1 and n
    
    Examples
    --------
    >>> import mdp
Steven Cordwell's avatar
Steven Cordwell committed
1021
    """
1022 1023
    
    def __init__(self, transitions, reward, discount, epsilon=0.01, max_iter=10):
        """"""
        
        # Maybe its better not to subclass from PolicyIteration, because the
        # initialisation of the two are quite different. eg there is policy0
        # being calculated here which doesn't need to be. The only thing that
        # is needed from the PolicyIteration class is the evalPolicyIterative
        # function. Perhaps there is a better way to do it?
        PolicyIteration.__init__(self, transitions, reward, discount, None, max_iter, 1)
        
        # PolicyIteration doesn't pass epsilon to MDP.__init__() so we will
        # check it here
        if type(epsilon) in (int, float):
            if epsilon <= 0:
                raise ValueError("PyMDPtoolbox: epsilon must be greater than 0")
        else:
            raise ValueError("PyMDPtoolbox: epsilon must be a positive real "\
                "number greater than zero.")
        
        # computation of threshold of variation for V for an epsilon-optimal policy
        if self.discount != 1:
            self.thresh = epsilon * (1 - self.discount) / self.discount
        else:
            self.thresh = epsilon
        
        self.epsilon = epsilon
        
        if discount == 1:
            self.V = matrix(zeros((self.S, 1)))
        else:
            # min(min()) is not right
            self.V = 1 / (1 - discount) * self.R.min() * ones((self.S, 1))
    
    def iterate(self):
        """"""
        
        if self.verbose:
            print('  Iteration  V_variation')
        
        self.time = time()
        
        done = False
        while not done:
            self.iter = self.iter + 1
            
            self.policy, Vnext = self.bellmanOperator()
            #[Ppolicy, PRpolicy] = mdp_computePpolicyPRpolicy(P, PR, policy);
            
            variation = getSpan(Vnext - self.V)
            if self.verbose:
                print("      %s         %s" % (self.iter, variation))
            
            self.V = Vnext
            if variation < self.thresh:
                done = True
            else:
                is_verbose = False
                if self.verbose:
1081
                    self.setSilent()
1082 1083
                    is_verbose = True
                
1084
                self.evalPolicyIterative(self.V, self.epsilon, self.max_iter)
1085 1086
                
                if is_verbose:
1087
                    self.setVerbose()
1088
        
1089
        self.time = time() - self.time
1090 1091 1092 1093
        
        # store value and policy as tuples
        self.V = tuple(self.V.getA1().tolist())
        self.policy = tuple(self.policy.getA1().tolist())
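
    # Illustrative usage (a sketch; the Examples section of the class docstring
    # above is otherwise empty): the class is called the same way as
    # PolicyIteration.
    #
    #     >>> P, R = exampleRand(5, 3)
    #     >>> pim = PolicyIterationModified(P, R, 0.9)
    #     >>> pim.iterate()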

class QLearning(MDP):
    """Evaluates the matrix Q, using the Q learning algorithm.
    
    Let S = number of states, A = number of actions
    
    Parameters
    ----------
    P : transition matrix (SxSxA)
        P could be an array with 3 dimensions or a cell array (1xA), each
        cell containing a sparse matrix (SxS)
    R : reward matrix(SxSxA) or (SxA)
        R could be an array with 3 dimensions (SxSxA) or a cell array
        (1xA), each cell containing a sparse matrix (SxS) or a 2D
        array(SxA) possibly sparse
    discount : discount rate
        in ]0; 1[    
    n_iter : number of iterations to execute (optional).
        Default value = 10000; it must be an integer greater than the default value.
    
    Results
    -------
    Q : learned Q matrix (SxA) 
    
    V : learned value function (S).
    
    policy : learned optimal policy (S).
    
    mean_discrepancy : vector of V discrepancy mean over 100 iterations
        Then the length of this vector for the default value of N is 100 
        (N/100).

    Examples
    --------
    >>> import random # this example is reproducible only if random seed is set
    >>> import mdp
    >>> random.seed(0)
    >>> P, R = mdp.exampleForest()
    >>> ql = mdp.QLearning(P, R, 0.96)
    >>> ql.iterate()
    >>> ql.Q
    array([[ 68.80977389,  46.62560314],
           [ 72.58265749,  43.1170545 ],
           [ 77.1332834 ,  65.01737419]])
    >>> ql.V
    (68.80977388561172, 72.5826574913828, 77.13328339600116)
    >>> ql.policy
    (0, 0, 0)
    
    >>> import random # this example is reproducible only if random seed is set
    >>> import mdp
    >>> import numpy as np
    >>> P = np.array([[[0.5, 0.5],[0.8, 0.2]],[[0, 1],[0.1, 0.9]]])
    >>> R = np.array([[5, 10], [-1, 2]])
    >>> random.seed(0)
    >>> ql = mdp.QLearning(P, R, 0.9)
    >>> ql.iterate()
    >>> ql.Q
    array([[ 36.63245946,  42.24434307],
           [ 35.96582807,  32.70456417]])
    >>> ql.V
    (42.24434307022128, 35.96582807367007)
    >>> ql.policy
    (1, 0)
    
    """
    
    def __init__(self, transitions, reward, discount, n_iter=10000