# -*- coding: utf-8 -*-
"""Markov Decision Process (MDP) Toolbox: ``example`` module
=========================================================

The ``example`` module provides functions to generate valid MDP transition and
reward matrices.

Available functions
-------------------
forest
    A simple forest management example
rand
    A random example

"""

# Copyright (c) 2011-2014 Steven A. W. Cordwell
# Copyright (c) 2009 INRA
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#   * Redistributions of source code must retain the above copyright notice,
#     this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions and the following disclaimer in the documentation
#     and/or other materials provided with the distribution.
#   * Neither the name of the <ORGANIZATION> nor the names of its contributors
#     may be used to endorse or promote products derived from this software
#     without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

from numpy import diag, ones, zeros
from numpy.random import randint, random

from scipy.sparse import coo_matrix, dok_matrix

def forest(S=3, r1=4, r2=2, p=0.1, is_sparse=False):
    """Generate an MDP example based on a simple forest management scenario.

    This function generates a transition probability
    (``A`` × ``S`` × ``S``) array ``P`` and a reward (``S`` × ``A``) matrix
    ``R`` that model the following problem. A forest is managed by two actions:
    'Wait' and 'Cut'. An action is decided each year, with the primary
    objective of maintaining an old forest for wildlife and the secondary
    objective of making money by selling cut wood. Each year there is a
    probability ``p`` that a fire burns the forest.

    Here is how the problem is modelled.
    Let {0, 1, ..., ``S``-1} be the states of the forest, with ``S``-1 being
    the oldest. Let 'Wait' be action 0 and 'Cut' be action 1.
    After a fire, the forest is in the youngest state, that is state 0.
    The transition matrix ``P`` of the problem can then be defined as follows::

                   | p 1-p 0.......0  |
                   | .  0 1-p 0....0  |
        P[0,:,:] = | .  .  0  .       |
                   | .  .        .    |
                   | .  .         1-p |
                   | p  0  0....0 1-p |

                   | 1 0..........0 |
                   | . .          . |
        P[1,:,:] = | . .          . |
                   | . .          . |
                   | . .          . |
                   | 1 0..........0 |

    The reward matrix R is defined as follows::

                 |  0  |
                 |  .  |
        R[:,0] = |  .  |
                 |  .  |
                 |  0  |
                 |  r1 |

                 |  0  |
                 |  1  |
        R[:,1] = |  .  |
                 |  .  |
                 |  1  |
                 |  r2 |

    Parameters
    ----------
    S : int, optional
        The number of states, which should be an integer greater than 1.
        Default: 3.
    r1 : float, optional
        The reward when the forest is in its oldest state and action 'Wait' is
        performed. Default: 4.
    r2 : float, optional
        The reward when the forest is in its oldest state and action 'Cut' is
        performed. Default: 2.
    p : float, optional
        The probability of wildfire occurrence, in the range (0, 1). Default:
        0.1.
    is_sparse : bool, optional
        If True, then the probability transition matrices will be returned in
        sparse format, otherwise they will be in dense format. Default: False.

    Returns
    -------
    out : tuple
        ``out[0]`` contains the transition probability matrix P and ``out[1]``
        contains the reward matrix R. If ``is_sparse=False`` then P is a numpy
        array with a shape of ``(A, S, S)`` and R is a numpy array with a shape
        of ``(S, A)``. If ``is_sparse=True`` then P is a tuple of length ``A``
        where each ``P[a]`` is a scipy sparse CSR format matrix of shape
        ``(S, S)``; R remains the same as in the case of ``is_sparse=False``.

    Examples
    --------
    >>> import mdptoolbox.example
    >>> P, R = mdptoolbox.example.forest()
    >>> P
    array([[[ 0.1,  0.9,  0. ],
            [ 0.1,  0. ,  0.9],
            [ 0.1,  0. ,  0.9]],
    <BLANKLINE>
           [[ 1. ,  0. ,  0. ],
            [ 1. ,  0. ,  0. ],
            [ 1. ,  0. ,  0. ]]])
    >>> R
    array([[ 0.,  0.],
           [ 0.,  1.],
           [ 4.,  2.]])
    >>> Psp, Rsp = mdptoolbox.example.forest(is_sparse=True)
    >>> len(Psp)
    2
    >>> Psp[0]
    <3x3 sparse matrix of type '<type 'numpy.float64'>'
        with 6 stored elements in Compressed Sparse Row format>
    >>> Psp[1]
    <3x3 sparse matrix of type '<type 'numpy.int64'>'
        with 3 stored elements in Compressed Sparse Row format>
    >>> Rsp
    array([[ 0.,  0.],
           [ 0.,  1.],
           [ 4.,  2.]])
    >>> (Psp[0].todense() == P[0]).all()
    True
    >>> (Rsp == R).all()
    True

    """
    assert S > 1, "The number of states S must be greater than 1."
    assert (r1 > 0) and (r2 > 0), "The rewards must be positive."
    assert 0 <= p <= 1, "The probability p must be in [0, 1]."
    # Definition of Transition matrix
    if is_sparse:
        P = []
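        # Action 0 ('Wait'): a fire sends the forest back to state 0 with
        # probability p, otherwise the forest ages by one state (the oldest
        # state S-1 stays at S-1 with probability 1-p).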
        rows = list(range(S)) * 2
        cols = [0] * S + list(range(1, S)) + [S - 1]
        vals = [p] * S + [1-p] * S
        P.append(coo_matrix((vals, (rows, cols)), shape=(S,S)).tocsr())
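        # Action 1 ('Cut'): cutting always returns the forest to state 0,
        # whatever the current state.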
        rows = list(range(S))
        cols = [0] * S
        vals = [1] * S
        P.append(coo_matrix((vals, (rows, cols)), shape=(S,S)).tocsr())
    else:
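        # Dense case: 'Wait' places 1-p on the first superdiagonal (ageing),
        # p in column 0 (fire) and 1-p in the bottom-right corner (the oldest
        # state persists); 'Cut' sends every state back to state 0.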
        P = zeros((2, S, S))
        P[0, :, :] = (1 - p) * diag(ones(S - 1), 1)
        P[0, :, 0] = p
        P[0, S - 1, S - 1] = (1 - p)
        P[1, :, :] = zeros((S, S))
        P[1, :, 0] = 1
    # Definition of Reward matrix
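    # 'Wait' pays r1 only in the oldest state; 'Cut' pays 1 in every state
    # except the youngest (state 0) and r2 in the oldest state.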
    R = zeros((S, 2))
    R[S - 1, 0] = r1
    R[:, 1] = ones(S)
    R[0, 1] = 0
    R[S - 1, 1] = r2
    # we want to return the generated transition and reward matrices
    return(P, R)
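
# A minimal usage sketch (illustrative only; it assumes the companion
# ``mdptoolbox.mdp`` solver module is installed alongside this one): the
# (P, R) pair returned by ``forest`` can be passed directly to a discounted
# MDP solver, for example
#
#     import mdptoolbox.example
#     P, R = mdptoolbox.example.forest()
#     vi = mdptoolbox.mdp.ValueIteration(P, R, 0.9)
#     vi.run()
#     vi.policy  # one action (0='Wait', 1='Cut') per state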

def rand(S, A, is_sparse=False, mask=None):
    """Generate a random Markov Decision Process.

    Parameters
    ----------
    S : int
        Number of states (> 1)
    A : int
        Number of actions (> 1)
    is_sparse : bool, optional
        False to have matrices in dense format, True to have sparse matrices.
        Default: False.
    mask : array, optional
        Array of 0s and 1s, where a 0 indicates a transition that is forced to
        have zero probability; the shape can be ``(S, S)`` or ``(A, S, S)``.
        Default: a random mask is generated.

    Returns
    -------
    out : tuple
        ``out[0]`` contains the transition probability matrix P and ``out[1]``
        contains the reward matrix R. If ``is_sparse=False`` then P and R are
        numpy arrays with a shape of ``(A, S, S)``. If ``is_sparse=True`` then
        P and R are tuples of length ``A``, where each ``P[a]`` and each
        ``R[a]`` is a scipy sparse CSR format matrix of shape ``(S, S)``.

    Examples
    --------
    >>> import numpy, mdptoolbox.example
    >>> numpy.random.seed(0) # Needed to get the output below
    >>> P, R = mdptoolbox.example.rand(4, 3)
    >>> P
    array([[[ 0.21977283,  0.14889403,  0.30343592,  0.32789723],
            [ 1.        ,  0.        ,  0.        ,  0.        ],
            [ 0.        ,  0.43718772,  0.54480359,  0.01800869],
            [ 0.39766289,  0.39997167,  0.12547318,  0.07689227]],
    <BLANKLINE>
           [[ 1.        ,  0.        ,  0.        ,  0.        ],
            [ 0.32261337,  0.15483812,  0.32271303,  0.19983549],
            [ 0.33816885,  0.2766999 ,  0.12960299,  0.25552826],
            [ 0.41299411,  0.        ,  0.58369957,  0.00330633]],
    <BLANKLINE>
           [[ 0.32343037,  0.15178596,  0.28733094,  0.23745272],
            [ 0.36348538,  0.24483321,  0.16114188,  0.23053953],
            [ 1.        ,  0.        ,  0.        ,  0.        ],
            [ 0.        ,  0.        ,  1.        ,  0.        ]]])
    >>> R
    array([[[-0.23311696,  0.58345008,  0.05778984,  0.13608912],
            [-0.07704128,  0.        , -0.        ,  0.        ],
            [ 0.        ,  0.22419145,  0.23386799,  0.88749616],
            [-0.3691433 , -0.27257846,  0.14039354, -0.12279697]],
    <BLANKLINE>
           [[-0.77924972,  0.        , -0.        , -0.        ],
            [ 0.47852716, -0.92162442, -0.43438607, -0.75960688],
            [-0.81211898,  0.15189299,  0.8585924 , -0.3628621 ],
            [ 0.35563307, -0.        ,  0.47038804,  0.92437709]],
    <BLANKLINE>
           [[-0.4051261 ,  0.62759564, -0.20698852,  0.76220639],
            [-0.9616136 , -0.39685037,  0.32034707, -0.41984479],
            [-0.13716313,  0.        , -0.        , -0.        ],
            [ 0.        , -0.        ,  0.55810204,  0.        ]]])
    >>> numpy.random.seed(0) # Needed to get the output below
    >>> Psp, Rsp = mdptoolbox.example.rand(100, 5, is_sparse=True)
    >>> len(Psp), len(Rsp)
    (5, 5)
    >>> Psp[0]
    <100x100 sparse matrix of type '<type 'numpy.float64'>'
        with 3296 stored elements in Compressed Sparse Row format>
    >>> Rsp[0]
    <100x100 sparse matrix of type '<type 'numpy.float64'>'
        with 3296 stored elements in Compressed Sparse Row format>
    >>> # The number of non-zero elements (nnz) in P and R are equal
    >>> Psp[1].nnz == Rsp[1].nnz
    True

    """
    # make sure that there is more than one state and more than one action
    assert S > 1, "The number of states S must be greater than 1."
    assert A > 1, "The number of actions A must be greater than 1."
    # if the user has supplied a mask, check that it has a valid type and
    # shape; otherwise a random mask is generated inside the loops below
    if mask is not None:
        # the mask needs to be SxS or AxSxS
        try:
            assert mask.shape in ((S, S), (A, S, S)), \
                "'mask' must have dimensions S×S or A×S×S."
        except AttributeError:
            raise TypeError("'mask' must be a numpy array or matrix.")
    # generate the transition and reward matrices based on S, A and mask
    if is_sparse:
        # definition of transition matrix : square stochastic matrix
        P = [None] * A
        # definition of reward matrix (values between -1 and +1)
        R = [None] * A
        for a in range(A):
            # it may be more efficient to implement this by constructing lists
            # of rows, columns and values then creating a coo_matrix, but this
            # works for now
            PP = dok_matrix((S, S))
            RR = dok_matrix((S, S))
            for s in range(S):
                if mask is None:
                    m = random(S)
                    m[m <= 2/3.0] = 0
                    m[m > 2/3.0] = 1
                elif mask.shape == (A, S, S):
                    m = mask[a][s] # mask[a, s, :]
                else:
                    m = mask[s]
                n = int(m.sum())  # number of allowed transitions from state s
                if n == 0:
                    m[randint(0, S)] = 1
                    n = 1
                # find the columns of the vector that have non-zero elements
                nz = m.nonzero()
                if len(nz) == 1:
                    cols = nz[0]
                else:
                    cols = nz[1]
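                # draw n transition probabilities and normalise them so that
                # the row sums to one; rewards are uniform on [-1, 1)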
                vals = random(n)
                vals = vals / vals.sum()
                reward = 2*random(n) - ones(n)
                PP[s, cols] = vals
                RR[s, cols] = reward
            # PP.tocsr() takes the same amount of time as PP.tocoo().tocsr()
            # so constructing PP and RR as coo_matrix in the first place is
            # probably "better"
            P[a] = PP.tocsr()
            R[a] = RR.tocsr()
    else:
        # definition of transition matrix : square stochastic matrix
        P = zeros((A, S, S))
        # definition of reward matrix (values between -1 and +1)
        R = zeros((A, S, S))
        for a in range(A):
            for s in range(S):
                # create our own random mask if there is no user supplied one
                if mask is None:
                    m = random(S)
                    r = random()
                    m[m <= r] = 0
                    m[m > r] = 1
                elif mask.shape == (A, S, S):
                    m = mask[a][s] # mask[a, s, :]
                else:
                    m = mask[s]
                # Make sure that there is at least one transition in each state
                if m.sum() == 0:
                    m[randint(0, S)] = 1
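                # mask the random transition probabilities, renormalise the
                # row to sum to one, and draw masked rewards uniform on [-1, 1)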
                P[a][s] = m * random(S)
                P[a][s] = P[a][s] / P[a][s].sum()
                R[a][s] = (m * (2*random(S) - ones(S, dtype=int)))
    # we want to return the generated transition and reward matrices
    return(P, R)
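
# A quick sanity check (illustrative only, not part of the module): in the
# dense case every row of each transition matrix returned by ``rand`` should
# sum to one, e.g.
#
#     import numpy as np
#     P, R = rand(10, 3)
#     assert np.allclose(P.sum(axis=2), 1.0)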

if __name__ == "__main__":
    import doctest
    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)