# -*- coding: utf-8 -*-
"""Markov Decision Process (MDP) Toolbox: ``example`` module
=========================================================

The ``example`` module provides functions to generate valid MDP transition
and reward matrices.

Available functions
-------------------
forest
    A simple forest management example
rand
    A random example
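
A quick example of use (both functions are demonstrated in more detail in
their own docstrings):

>>> import mdptoolbox.example
>>> P, R = mdptoolbox.example.forest()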

"""

# Copyright (c) 2011-2013 Steven A. W. Cordwell
# Copyright (c) 2009 INRA
# 
# All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 
#   * Redistributions of source code must retain the above copyright notice,
#     this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions and the following disclaimer in the documentation
#     and/or other materials provided with the distribution.
#   * Neither the name of the <ORGANIZATION> nor the names of its contributors
#     may be used to endorse or promote products derived from this software
#     without specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

from numpy import diag, ones, where, zeros
from numpy.random import randint, random
from scipy.sparse import coo_matrix, dok_matrix

def forest(S=3, r1=4, r2=2, p=0.1, is_sparse=False):
    """Generate a MDP example based on a simple forest management scenario.
    
    This function is used to generate a transition probability
    (``A`` × ``S`` × ``S``) array ``P`` and a reward (``S`` × ``A``) matrix
    ``R`` that model the following problem. A forest is managed by two actions:
    'Wait' and 'Cut'. An action is decided each year with first the objective
    to maintain an old forest for wildlife and second to make money selling cut
    wood. Each year there is a probability ``p`` that a fire burns the forest.
    
    Here is how the problem is modelled.
    Let {1, 2 . . . ``S`` } be the states of the forest, with ``S`` being the 
    oldest. Let 'Wait' be action 1 and 'Cut' action 2.
    After a fire, the forest is in the youngest state, that is state 1.
    The transition matrix P of the problem can then be defined as follows::
        
                   | p 1-p 0.......0  |
                   | .  0 1-p 0....0  |
        P[1,:,:] = | .  .  0  .       |
                   | .  .        .    |
                   | .  .         1-p |
                   | p  0  0....0 1-p |
        
                   | 1 0..........0 |
                   | . .          . |
        P[2,:,:] = | . .          . |
                   | . .          . |
                   | . .          . |
                   | 1 0..........0 |
    
    The reward matrix R is defined as follows::
        
                 |  0  |
                 |  .  |
        R[:,1] = |  .  |
                 |  .  |
                 |  0  |
                 |  r1 |
        
                 |  0  |
                 |  1  |
        R[:,2] = |  .  |
                 |  .  |
                 |  1  |
                 |  r2 |
    
    Parameters
    ----------
    S : int, optional
        The number of states, which should be an integer greater than 1. By
        default it is 3.
    r1 : float, optional
        The reward when the forest is in its oldest state and action 'Wait' is
        performed. By default it is 4.
    r2 : float, optional
        The reward when the forest is in its oldest state and action 'Cut' is
        performed. By default it is 2.
    p : float, optional
        The probability of wildfire occurrence, in the range [0, 1]. By
        default it is 0.1.
    is_sparse : bool, optional
        If True, the transition matrices are returned as a list of sparse CSR
        matrices, one per action, instead of a dense array. By default it is
        False.
    
    Returns
    -------
    out : tuple
        ``out[0]`` contains the transition probability matrix P with a shape
        of (A, S, S) (or, when ``is_sparse`` is True, a list of A sparse CSR
        matrices of shape (S, S)). ``out[1]`` contains the reward matrix R
        with a shape of (S, A).
    
    Examples
    --------
    >>> import mdptoolbox.example
    >>> P, R = mdptoolbox.example.forest()
    >>> P
    array([[[ 0.1,  0.9,  0. ],
            [ 0.1,  0. ,  0.9],
            [ 0.1,  0. ,  0.9]],
    <BLANKLINE>
           [[ 1. ,  0. ,  0. ],
            [ 1. ,  0. ,  0. ],
            [ 1. ,  0. ,  0. ]]])
    >>> R
    array([[ 0.,  0.],
           [ 0.,  1.],
           [ 4.,  2.]])
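
    The transition matrices can also be generated in sparse format; ``P`` is
    then a list of per-action ``scipy.sparse`` CSR matrices, while ``R``
    remains a dense array:

    >>> Psp, Rsp = mdptoolbox.example.forest(10, is_sparse=True)
    >>> len(Psp)
    2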
    
    """
    assert S > 1, "The number of states S must be greater than 1."
    assert (r1 > 0) and (r2 > 0), "The rewards must be positive."
    assert 0 <= p <= 1, "The probability p must be in [0, 1]."
    # Definition of Transition matrix P(:,:,1) associated to action Wait
    # (action 1) and P(:,:,2) associated to action Cut (action 2)
    #             | p 1-p 0.......0  |                  | 1 0..........0 |
    #             | .  0 1-p 0....0  |                  | . .          . |
    #  P(:,:,1) = | .  .  0  .       |  and P(:,:,2) =  | . .          . |
    #             | .  .        .    |                  | . .          . |
    #             | .  .         1-p |                  | . .          . |
    #             | p  0  0....0 1-p |                  | 1 0..........0 |
    if is_sparse:
        P = []
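        # Action 'Wait': with probability p a fire sends the forest back to
        # state 0, otherwise it grows from state s into state s + 1 (the
        # oldest state S - 1 stays put with probability 1 - p).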
        rows = list(range(S)) * 2
        cols = [0] * S + list(range(1, S)) + [S - 1]
        vals = [p] * S + [1-p] * S
        P.append(coo_matrix((vals, (rows, cols)), shape=(S,S)).tocsr())
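        # Action 'Cut': cutting always returns the forest to the youngest
        # state 0.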
        rows = list(range(S))
        cols = [0] * S
        vals = [1] * S
        P.append(coo_matrix((vals, (rows, cols)), shape=(S,S)).tocsr())
    else:
        P = zeros((2, S, S))
        P[0, :, :] = (1 - p) * diag(ones(S - 1), 1)
        P[0, :, 0] = p
        P[0, S - 1, S - 1] = (1 - p)
        P[1, :, :] = zeros((S, S))
        P[1, :, 0] = 1
    # Definition of Reward matrix R1 associated to action Wait and 
    # R2 associated to action Cut
    #           | 0  |                   | 0  |
    #           | .  |                   | 1  |
    #  R(:,1) = | .  |  and     R(:,2) = | .  |
    #           | .  |                   | .  |
    #           | 0  |                   | 1  |                   
    #           | r1 |                   | r2 |
    R = zeros((S, 2))
    R[S - 1, 0] = r1
    R[:, 1] = ones(S)
    R[0, 1] = 0
    R[S - 1, 1] = r2
    # we want to return the generated transition and reward matrices
    return (P, R)

def rand(S, A, is_sparse=False, mask=None):
    """Generate a random Markov Decision Process.
    
    Parameters
    ----------
    S : int
        The number of states (> 1).
    A : int
        The number of actions (> 1).
    is_sparse : bool, optional
        False to return matrices in dense format, True to return sparse
        matrices (default: False).
    mask : array or None, optional
        An array of 0s and 1s with shape (S, S) or (A, S, S), where a 0
        marks a transition whose probability is forced to zero. By default a
        random mask is generated for each state.
    
    Returns
    -------
    out : tuple
        ``out[0]`` contains the transition probability matrix P with a shape
        of (A, S, S), and ``out[1]`` contains the reward matrix R with a
        shape of (A, S, S). When ``is_sparse`` is True, both are lists of A
        sparse CSR matrices of shape (S, S).

    Examples
    --------
    >>> import mdptoolbox.example
    >>> P, R = mdptoolbox.example.rand(5, 3)
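
    A mask can be used to restrict which transitions are allowed; for
    example, an all-ones (S × S) mask, which permits every transition:

    >>> import numpy
    >>> mask = numpy.ones((5, 5))
    >>> P, R = mdptoolbox.example.rand(5, 3, mask=mask)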
    
    """
    # make sure there is more than one state and more than one action
    assert S > 1, "The number of states S must be greater than 1."
    assert A > 1, "The number of actions A must be greater than 1."
    # if the user has supplied a mask, check that it has a valid shape;
    # otherwise a random mask is generated per state further below
    if mask is not None:
        # the mask needs to be SxS or AxSxS
        try:
            assert mask.shape in ((S, S), (A, S, S)), \
                "'mask' must have dimensions S×S or A×S×S."
        except AttributeError:
            raise TypeError("'mask' must be a numpy array or matrix.")
    # generate the transition and reward matrices based on S, A and mask
    if is_sparse:
        # definition of transition matrix : square stochastic matrix
        P = [None] * A
        # definition of reward matrix (values between -1 and +1)
        R = [None] * A
        for a in range(A):
            # it may be more efficient to implement this by constructing lists
            # of rows, columns and values then creating a coo_matrix, but this
            # works for now
            PP = dok_matrix((S, S))
            RR = dok_matrix((S, S))
            for s in range(S):
                if mask is None:
                    m = random(S)
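                    # keep roughly one third of the possible transitions,
                    # chosen at random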
                    m[m <= 2/3.0] = 0
                    m[m > 2/3.0] = 1
                elif mask.shape == (A, S, S):
                    m = mask[a][s] # mask[a, s, :]
                else:
                    m = mask[s]
                n = int(m.sum())  # number of allowed transitions from s
                if n == 0:
                    m[randint(0, S)] = 1
                    n = 1
                cols = where(m)[0]  # indices of the allowed successor states
                vals = random(n)
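                # normalise the transition probabilities so the row sums to 1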
                vals = vals / vals.sum()
                reward = 2*random(n) - ones(n)
                PP[s, cols] = vals
                RR[s, cols] = reward
            # PP.tocsr() takes the same amount of time as PP.tocoo().tocsr()
            # so constructing PP and RR as coo_matrix in the first place is
            # probably "better"
            P[a] = PP.tocsr()
            R[a] = RR.tocsr()
    else:
        # definition of transition matrix : square stochastic matrix
        P = zeros((A, S, S))
        # definition of reward matrix (values between -1 and +1)
        R = zeros((A, S, S))
        for a in range(A):
            for s in range(S):
                # create our own random mask if there is no user supplied one
                if mask is None:
                    m = random(S)
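                    # use a random threshold so that each state keeps a
                    # random fraction of the possible transitions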
                    r = random()
                    m[m <= r] = 0
                    m[m > r] = 1
                elif mask.shape == (A, S, S):
                    m = mask[a][s] # mask[a, s, :]
                else:
                    m = mask[s]
                # Make sure that there is at least one transition from each
                # state
                if m.sum() == 0:
                    m[randint(0, S)] = 1
                P[a][s] = m * random(S)
                P[a][s] = P[a][s] / P[a][s].sum()
                R[a][s] = (m * (2*random(S) - ones(S, dtype=int)))
    # we want to return the generated transition and reward matrices
    return (P, R)

if __name__ == "__main__":
    import doctest
    doctest.testmod()
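
    # Additionally, a small sanity check that the generators produce the
    # shapes documented in the docstrings above.
    P, R = forest()
    assert P.shape == (2, 3, 3) and R.shape == (3, 2)
    P, R = rand(5, 3)
    assert P.shape == (3, 5, 5) and R.shape == (3, 5, 5)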