example.py 13.6 KB
Newer Older
1
# -*- coding: utf-8 -*-
2 3 4 5
"""Markov Decision Process (MDP) Toolbox: ``example`` module
=========================================================

The ``example`` module provides functions to generate valid MDP transition and
6
reward matrices.
7 8 9 10 11 12 13

Available functions
-------------------
forest
    A simple forest management example
rand
    A random example
14 15 16

"""

17
# Copyright (c) 2011-2014 Steven A. W. Cordwell
18
# Copyright (c) 2009 INRA
19
#
20
# All rights reserved.
21
#
22 23
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
24
#
25 26 27 28 29 30 31 32
#   * Redistributions of source code must retain the above copyright notice,
#     this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions and the following disclaimer in the documentation
#     and/or other materials provided with the distribution.
#   * Neither the name of the <ORGANIZATION> nor the names of its contributors
#     may be used to endorse or promote products derived from this software
#     without specific prior written permission.
33
#
34 35 36 37 38 39 40 41 42 43 44 45
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

46 47
import numpy as _np
import scipy.sparse as _sp
48

49
def forest(S=3, r1=4, r2=2, p=0.1, is_sparse=False):
50
    """Generate a MDP example based on a simple forest management scenario.
51

52 53 54 55 56 57
    This function is used to generate a transition probability
    (``A`` × ``S`` × ``S``) array ``P`` and a reward (``S`` × ``A``) matrix
    ``R`` that model the following problem. A forest is managed by two actions:
    'Wait' and 'Cut'. An action is decided each year with first the objective
    to maintain an old forest for wildlife and second to make money selling cut
    wood. Each year there is a probability ``p`` that a fire burns the forest.
58

59
    Here is how the problem is modelled.
60 61 62 63
    Let {0, 1 . . . ``S``-1 } be the states of the forest, with ``S``-1 being
    the oldest. Let 'Wait' be action 0 and 'Cut' be action 1.
    After a fire, the forest is in the youngest state, that is state 0.
    The transition matrix ``P`` of the problem can then be defined as follows::
64

65 66
                   | p 1-p 0.......0  |
                   | .  0 1-p 0....0  |
67
        P[0,:,:] = | .  .  0  .       |
68 69 70
                   | .  .        .    |
                   | .  .         1-p |
                   | p  0  0....0 1-p |
71

72 73
                   | 1 0..........0 |
                   | . .          . |
74
        P[1,:,:] = | . .          . |
75 76 77
                   | . .          . |
                   | . .          . |
                   | 1 0..........0 |
78

79
    The reward matrix R is defined as follows::
80

81 82
                 |  0  |
                 |  .  |
83
        R[:,0] = |  .  |
84 85 86
                 |  .  |
                 |  0  |
                 |  r1 |
87

88 89
                 |  0  |
                 |  1  |
90
        R[:,1] = |  .  |
91 92 93
                 |  .  |
                 |  1  |
                 |  r2 |
94

95 96 97
    Parameters
    ---------
    S : int, optional
98 99
        The number of states, which should be an integer greater than 1.
        Default: 3.
100 101
    r1 : float, optional
        The reward when the forest is in its oldest state and action 'Wait' is
102
        performed. Default: 4.
103 104
    r2 : float, optional
        The reward when the forest is in its oldest state and action 'Cut' is
105
        performed. Default: 2.
106
    p : float, optional
107 108 109 110 111
        The probability of wild fire occurence, in the range ]0, 1[. Default:
        0.1.
    is_sparse : bool, optional
        If True, then the probability transition matrices will be returned in
        sparse format, otherwise they will be in dense format. Default: False.
112

113 114 115
    Returns
    -------
    out : tuple
116 117 118 119 120 121
        ``out[0]`` contains the transition probability matrix P  and ``out[1]``
        contains the reward matrix R. If ``is_sparse=False`` then P is a numpy
        array with a shape of ``(A, S, S)`` and R is a numpy array with a shape
        of ``(S, A)``. If ``is_sparse=True`` then P is a tuple of length ``A``
        where each ``P[a]`` is a scipy sparse CSR format matrix of shape
        ``(S, S)``; R remains the same as in the case of ``is_sparse=False``.
122

123 124
    Examples
    --------
125 126
    >>> import mdptoolbox.example
    >>> P, R = mdptoolbox.example.forest()
127 128 129 130 131 132 133 134 135 136 137 138
    >>> P
    array([[[ 0.1,  0.9,  0. ],
            [ 0.1,  0. ,  0.9],
            [ 0.1,  0. ,  0.9]],
    <BLANKLINE>
           [[ 1. ,  0. ,  0. ],
            [ 1. ,  0. ,  0. ],
            [ 1. ,  0. ,  0. ]]])
    >>> R
    array([[ 0.,  0.],
           [ 0.,  1.],
           [ 4.,  2.]])
139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
    >>> Psp, Rsp = mdptoolbox.example.forest(is_sparse=True)
    >>> len(Psp)
    2
    >>> Psp[0]
    <3x3 sparse matrix of type '<type 'numpy.float64'>'
        with 6 stored elements in Compressed Sparse Row format>
    >>> Psp[1]
    <3x3 sparse matrix of type '<type 'numpy.int64'>'
        with 3 stored elements in Compressed Sparse Row format>
    >>> Rsp
    array([[ 0.,  0.],
           [ 0.,  1.],
           [ 4.,  2.]])
    >>> (Psp[0].todense() == P[0]).all()
    True
    >>> (Rsp == R).all()
    True
156

157
    """
158 159 160
    assert S > 1, "The number of states S must be greater than 1."
    assert (r1 > 0) and (r2 > 0), "The rewards must be non-negative."
    assert 0 <= p <= 1, "The probability p must be in [0; 1]."
161
    # Definition of Transition matrix
162 163
    if is_sparse:
        P = []
Steven Cordwell's avatar
Steven Cordwell committed
164 165
        rows = list(range(S)) * 2
        cols = [0] * S + list(range(1, S)) + [S - 1]
166
        vals = [p] * S + [1-p] * S
167
        P.append(_sp.coo_matrix((vals, (rows, cols)), shape=(S,S)).tocsr())
Steven Cordwell's avatar
Steven Cordwell committed
168
        rows = list(range(S))
169 170
        cols = [0] * S
        vals = [1] * S
171
        P.append(_sp.coo_matrix((vals, (rows, cols)), shape=(S,S)).tocsr())
172
    else:
173 174
        P = _np.zeros((2, S, S))
        P[0, :, :] = (1 - p) * _np.diag(_np.ones(S - 1), 1)
175 176
        P[0, :, 0] = p
        P[0, S - 1, S - 1] = (1 - p)
177
        P[1, :, :] = _np.zeros((S, S))
178
        P[1, :, 0] = 1
179
    # Definition of Reward matrix
180
    R = _np.zeros((S, 2))
181
    R[S - 1, 0] = r1
182
    R[:, 1] = _np.ones(S)
183 184 185
    R[0, 1] = 0
    R[S - 1, 1] = r2
    # we want to return the generated transition and reward matrices
186
    return(P, R)
187

188
def rand(S, A, is_sparse=False, mask=None):
189
    """Generate a random Markov Decision Process.
190

191 192 193
    Parameters
    ----------
    S : int
194
        Number of states (> 1)
195
    A : int
196 197 198 199 200 201 202
        Number of actions (> 1)
    is_sparse : bool, optional
        False to have matrices in dense format, True to have sparse matrices.
        Default: False.
    mask : array, optional
        Array with 0 and 1 (0 indicates a place for a zero probability), shape
        can be ``(S, S)`` or ``(A, S, S)``. Default: random.
203

204 205 206
    Returns
    -------
    out : tuple
207 208 209 210 211 212 213
        ``out[0]`` contains the transition probability matrix P  and ``out[1]``
        contains the reward matrix R. If ``is_sparse=False`` then P is a numpy
        array with a shape of ``(A, S, S)`` and R is a numpy array with a shape
        of ``(S, A)``. If ``is_sparse=True`` then P and R are tuples of length
        ``A``, where each ``P[a]`` is a scipy sparse CSR format matrix of shape
        ``(S, S)`` and each ``R[a]`` is a scipy sparse csr format matrix of
        shape ``(S, 1)``.
214

215 216
    Examples
    --------
217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262
    >>> import numpy, mdptoolbox.example
    >>> numpy.random.seed(0) # Needed to get the output below
    >>> P, R = mdptoolbox.example.rand(4, 3)
    >>> P
    array([[[ 0.21977283,  0.14889403,  0.30343592,  0.32789723],
            [ 1.        ,  0.        ,  0.        ,  0.        ],
            [ 0.        ,  0.43718772,  0.54480359,  0.01800869],
            [ 0.39766289,  0.39997167,  0.12547318,  0.07689227]],
    <BLANKLINE>
           [[ 1.        ,  0.        ,  0.        ,  0.        ],
            [ 0.32261337,  0.15483812,  0.32271303,  0.19983549],
            [ 0.33816885,  0.2766999 ,  0.12960299,  0.25552826],
            [ 0.41299411,  0.        ,  0.58369957,  0.00330633]],
    <BLANKLINE>
           [[ 0.32343037,  0.15178596,  0.28733094,  0.23745272],
            [ 0.36348538,  0.24483321,  0.16114188,  0.23053953],
            [ 1.        ,  0.        ,  0.        ,  0.        ],
            [ 0.        ,  0.        ,  1.        ,  0.        ]]])
    >>> R
    array([[[-0.23311696,  0.58345008,  0.05778984,  0.13608912],
            [-0.07704128,  0.        , -0.        ,  0.        ],
            [ 0.        ,  0.22419145,  0.23386799,  0.88749616],
            [-0.3691433 , -0.27257846,  0.14039354, -0.12279697]],
    <BLANKLINE>
           [[-0.77924972,  0.        , -0.        , -0.        ],
            [ 0.47852716, -0.92162442, -0.43438607, -0.75960688],
            [-0.81211898,  0.15189299,  0.8585924 , -0.3628621 ],
            [ 0.35563307, -0.        ,  0.47038804,  0.92437709]],
    <BLANKLINE>
           [[-0.4051261 ,  0.62759564, -0.20698852,  0.76220639],
            [-0.9616136 , -0.39685037,  0.32034707, -0.41984479],
            [-0.13716313,  0.        , -0.        , -0.        ],
            [ 0.        , -0.        ,  0.55810204,  0.        ]]])
    >>> numpy.random.seed(0) # Needed to get the output below
    >>> Psp, Rsp = mdptoolbox.example.rand(100, 5, is_sparse=True)
    >>> len(Psp), len(Rsp)
    (5, 5)
    >>> Psp[0]
    <100x100 sparse matrix of type '<type 'numpy.float64'>'
        with 3296 stored elements in Compressed Sparse Row format>
    >>> Rsp[0]
    <100x100 sparse matrix of type '<type 'numpy.float64'>'
        with 3296 stored elements in Compressed Sparse Row format>
    >>> # The number of non-zero elements (nnz) in P and R are equal
    >>> Psp[1].nnz == Rsp[1].nnz
    True
263

264 265
    """
    # making sure the states and actions are more than one
266 267
    assert S > 1, "The number of states S must be greater than 1."
    assert A > 1, "The number of actions A must be greater than 1."
268 269 270 271
    # if the user hasn't specified a mask, then we will make a random one now
    if mask is not None:
        # the mask needs to be SxS or AxSxS
        try:
272 273
            assert mask.shape in ((S, S), (A, S, S)), "'mask' must have " \
            "dimensions S×S or A×S×S."
274
        except AttributeError:
275
            raise TypeError("'mask' must be a numpy array or matrix.")
276 277 278 279 280 281
    # generate the transition and reward matrices based on S, A and mask
    if is_sparse:
        # definition of transition matrix : square stochastic matrix
        P = [None] * A
        # definition of reward matrix (values between -1 and +1)
        R = [None] * A
Steven Cordwell's avatar
Steven Cordwell committed
282
        for a in range(A):
283 284 285
            # it may be more efficient to implement this by constructing lists
            # of rows, columns and values then creating a coo_matrix, but this
            # works for now
286 287
            PP = _sp.dok_matrix((S, S))
            RR = _sp.dok_matrix((S, S))
Steven Cordwell's avatar
Steven Cordwell committed
288
            for s in range(S):
289
                if mask is None:
290
                    m = _np.random.random(S)
291 292 293 294 295 296 297 298
                    m[m <= 2/3.0] = 0
                    m[m > 2/3.0] = 1
                elif mask.shape == (A, S, S):
                    m = mask[a][s] # mask[a, s, :]
                else:
                    m = mask[s]
                n = int(m.sum()) # m[s, :]
                if n == 0:
299
                    m[_np.random.randint(0, S)] = 1
300
                    n = 1
301 302 303 304 305 306
                # find the columns of the vector that have non-zero elements
                nz = m.nonzero()
                if len(nz) == 1:
                    cols = nz[0]
                else:
                    cols = nz[1]
307
                vals = _np.random.random(n)
308
                vals = vals / vals.sum()
309
                reward = 2*_np.random.random(n) - _np.ones(n)
310 311 312 313 314 315 316 317 318
                PP[s, cols] = vals
                RR[s, cols] = reward
            # PP.tocsr() takes the same amount of time as PP.tocoo().tocsr()
            # so constructing PP and RR as coo_matrix in the first place is
            # probably "better"
            P[a] = PP.tocsr()
            R[a] = RR.tocsr()
    else:
        # definition of transition matrix : square stochastic matrix
319
        P = _np.zeros((A, S, S))
320
        # definition of reward matrix (values between -1 and +1)
321
        R = _np.zeros((A, S, S))
322 323 324 325
        for a in range(A):
            for s in range(S):
                # create our own random mask if there is no user supplied one
                if mask is None:
326 327
                    m = _np.random.random(S)
                    r = _np.random.random()
328 329 330 331 332 333 334 335
                    m[m <= r] = 0
                    m[m > r] = 1
                elif mask.shape == (A, S, S):
                    m = mask[a][s] # mask[a, s, :]
                else:
                    m = mask[s]
                # Make sure that there is atleast one transition in each state
                if m.sum() == 0:
336 337
                    m[_np.random.randint(0, S)] = 1
                P[a][s] = m * _np.random.random(S)
338
                P[a][s] = P[a][s] / P[a][s].sum()
339 340
                R[a][s] = (m * (2 * _np.random.random(S) -
                                _np.ones(S, dtype=int)))
341
    # we want to return the generated transition and reward matrices
342
    return(P, R)