# -*- coding: utf-8 -*-
"""
Created on Sun Aug 18 14:32:25 2013

@author: steve
"""

from numpy import diag, ones, where, zeros
from numpy.random import randint, random
from scipy.sparse import coo_matrix, dok_matrix

def forest(S=3, r1=4, r2=2, p=0.1, is_sparse=False):
    """Generate a MDP example based on a simple forest management scenario.
    
    This function is used to generate a transition probability
    (``A`` × ``S`` × ``S``) array ``P`` and a reward (``S`` × ``A``) matrix
    ``R`` that model the following problem. A forest is managed by two actions:
    'Wait' and 'Cut'. An action is chosen each year: the first objective is to
    maintain an old forest for wildlife, and the second is to make money by
    selling cut wood. Each year there is a probability ``p`` that a fire burns
    the forest.
    
    Here is how the problem is modelled.
    Let {1, 2, ..., ``S``} be the states of the forest, with ``S`` being the
    oldest. Let 'Wait' be action 1 and 'Cut' action 2.
    After a fire, the forest is in the youngest state, that is state 1.
    The transition matrix P of the problem can then be defined as follows::
        
                   | p 1-p 0.......0  |
                   | .  0 1-p 0....0  |
        P[1,:,:] = | .  .  0  .       |
                   | .  .        .    |
                   | .  .         1-p |
                   | p  0  0....0 1-p |
        
                   | 1 0..........0 |
                   | . .          . |
        P[2,:,:] = | . .          . |
                   | . .          . |
                   | . .          . |
                   | 1 0..........0 |
    
    The reward matrix R is defined as follows::
        
                 |  0  |
                 |  .  |
        R[:,1] = |  .  |
                 |  .  |
                 |  0  |
                 |  r1 |
        
                 |  0  |
                 |  1  |
        R[:,2] = |  .  |
                 |  .  |
                 |  1  |
                 |  r2 |
    
    Parameters
    ----------
    S : int, optional
        The number of states, which should be an integer greater than 0. By
        default it is 3.
    r1 : float, optional
        The reward when the forest is in its oldest state and action 'Wait' is
        performed. By default it is 4.
    r2 : float, optional
        The reward when the forest is in its oldest state and action 'Cut' is
        performed. By default it is 2.
    p : float, optional
        The probability of wild fire occurrence, in the range [0, 1]. By
        default it is 0.1.
    is_sparse : bool, optional
        If True, the transition matrices are returned as a list of sparse
        (CSR) matrices, one per action. By default it is False.
    
    Returns
    -------
    out : tuple
        ``out[0]`` contains the transition probability matrix P with a shape of
        (A, S, S). ``out[1]`` contains the reward matrix R with a shape of
        (S, A).
    
    Examples
    --------
    >>> import example
    >>> P, R = example.forest()
    >>> P
    array([[[ 0.1,  0.9,  0. ],
            [ 0.1,  0. ,  0.9],
            [ 0.1,  0. ,  0.9]],
    <BLANKLINE>
           [[ 1. ,  0. ,  0. ],
            [ 1. ,  0. ,  0. ],
            [ 1. ,  0. ,  0. ]]])
    >>> R
    array([[ 0.,  0.],
           [ 0.,  1.],
           [ 4.,  2.]])
    
    """
    if S <= 1:
        raise ValueError("The number of states 'S' must be greater than 1.")
    if (r1 <= 0) or (r2 <= 0):
        raise ValueError("The rewards must be greater than 0.")
    if (p < 0) or (p > 1):
        raise ValueError("The probability 'p' must be in [0, 1].")
    # Definition of Transition matrix P(:,:,1) associated to action Wait
    # (action 1) and P(:,:,2) associated to action Cut (action 2)
    #             | p 1-p 0.......0  |                  | 1 0..........0 |
    #             | .  0 1-p 0....0  |                  | . .          . |
    #  P(:,:,1) = | .  .  0  .       |  and P(:,:,2) =  | . .          . |
    #             | .  .        .    |                  | . .          . |
    #             | .  .         1-p |                  | . .          . |
    #             | p  0  0....0 1-p |                  | 1 0..........0 |
    if is_sparse:
        P = []
        rows = range(S) * 2
        cols = [0] * S + range(1, S) + [S - 1]
        vals = [p] * S + [1-p] * S
        P.append(coo_matrix((vals, (rows, cols)), shape=(S,S)).tocsr())
        rows = range(S)
        cols = [0] * S
        vals = [1] * S
        P.append(coo_matrix((vals, (rows, cols)), shape=(S,S)).tocsr())
    else:
        P = zeros((2, S, S))
        P[0, :, :] = (1 - p) * diag(ones(S - 1), 1)
        P[0, :, 0] = p
        P[0, S - 1, S - 1] = (1 - p)
        P[1, :, :] = zeros((S, S))
        P[1, :, 0] = 1
    # Definition of Reward matrix R1 associated to action Wait and 
    # R2 associated to action Cut
    #           | 0  |                   | 0  |
    #           | .  |                   | 1  |
    #  R(:,1) = | .  |  and     R(:,2) = | .  |	
    #           | .  |                   | .  |
    #           | 0  |                   | 1  |                   
    #           | r1 |                   | r2 |
    R = zeros((S, 2))
    R[S - 1, 0] = r1
    R[:, 1] = ones(S)
    R[0, 1] = 0
    R[S - 1, 1] = r2
    # we want to return the generated transition and reward matrices
    return (P, R)
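

# Illustrative usage sketch (an addition, not part of the original module):
# a quick sanity check that the dense matrices returned by forest() match the
# structure documented in its docstring.
def _forest_demo():
    P, R = forest(S=3, r1=4, r2=2, p=0.1)
    # each row of both transition matrices must sum to 1 (row-stochastic)
    assert (abs(P.sum(axis=2) - 1) < 1e-12).all()
    # 'Wait' (column 0) is rewarded only in the oldest state, where it pays r1;
    # 'Cut' (column 1) pays r2 in the oldest state
    assert R[-1, 0] == 4 and R[-1, 1] == 2
    return P, R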

def rand(S, A, is_sparse=False, mask=None):
    """Generate a random Markov Decision Process.
    
    Parameters
    ----------
    S : int
        number of states (> 0)
    A : int
        number of actions (> 0)
    is_sparse : bool, optional
        False to return the matrices in dense format, True to return sparse
        (CSR) matrices (default: False).
    mask : array or None, optional
        matrix of 0s and 1s with shape (S, S) or (A, S, S), where a 0 marks a
        position that must have a zero probability (default: a random mask is
        generated)
    
    Returns
    -------
    out : tuple
        ``out[0]`` contains the transition probability matrix P with a shape of
        (A, S, S). ``out[1]`` contains the reward matrix R with a shape of
        (A, S, S).

    Examples
    --------
    >>> import example
    >>> P, R = example.rand(5, 3)
    
    """
    # making sure there is at least one state and one action
    if (S < 1) or (A < 1):
        raise ValueError("The number of states S and the number of actions A "
                         "must be greater than 0.")
    # if the user has supplied a mask, check that it has a valid shape
    if mask is not None:
        # the mask needs to be SxS or AxSxS
        try:
            if mask.shape not in ((S, S), (A, S, S)):
                raise ValueError("'mask' must have dimensions S×S or A×S×S.")
        except AttributeError:
            raise TypeError("'mask' must be a numpy array or matrix.")
    # generate the transition and reward matrices based on S, A and mask
    if is_sparse:
        # definition of transition matrix : square stochastic matrix
        P = [None] * A
        # definition of reward matrix (values between -1 and +1)
        R = [None] * A
        for a in xrange(A):
            # it may be more efficient to implement this by constructing lists
            # of rows, columns and values then creating a coo_matrix, but this
            # works for now
            PP = dok_matrix((S, S))
            RR = dok_matrix((S, S))
            for s in xrange(S):
                if mask is None:
                    m = random(S)
                    m[m <= 2/3.0] = 0
                    m[m > 2/3.0] = 1
                elif mask.shape == (A, S, S):
                    m = mask[a][s] # mask[a, s, :]
                else:
                    m = mask[s]
                n = int(m.sum()) # m[s, :]
                if n == 0:
                    m[randint(0, S)] = 1
                    n = 1
                cols = where(m)[0] # m[s, :]
                vals = random(n)
                vals = vals / vals.sum()
                reward = 2*random(n) - ones(n)
                PP[s, cols] = vals
                RR[s, cols] = reward
            # PP.tocsr() takes the same amount of time as PP.tocoo().tocsr()
            # so constructing PP and RR as coo_matrix in the first place is
            # probably "better"
            P[a] = PP.tocsr()
            R[a] = RR.tocsr()
    else:
        # definition of transition matrix : square stochastic matrix
        P = zeros((A, S, S))
        # definition of reward matrix (values between -1 and +1)
        R = zeros((A, S, S))
        for a in range(A):
            for s in range(S):
                # create our own random mask if there is no user supplied one
                if mask is None:
                    m = random(S)
                    r = random()
                    m[m <= r] = 0
                    m[m > r] = 1
                elif mask.shape == (A, S, S):
                    m = mask[a][s] # mask[a, s, :]
                else:
                    m = mask[s]
                # Make sure that there is at least one transition in each state
                if m.sum() == 0:
                    m[randint(0, S)] = 1
                P[a][s] = m * random(S)
                P[a][s] = P[a][s] / P[a][s].sum()
                R[a][s] = (m * (2*random(S) - ones(S, dtype=int)))
    # we want to return the generated transition and reward matrices
    return (P, R)
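

# A minimal command-line demonstration (an illustrative addition, not part of
# the original module): generate both example MDPs and report the shapes of
# the dense matrices they return.
if __name__ == "__main__":
    # default forest management MDP: P is (2, 3, 3) and R is (3, 2)
    P, R = forest()
    print(P.shape)
    print(R.shape)
    # random dense MDP with 5 states and 3 actions: P and R are both (3, 5, 5)
    P2, R2 = rand(5, 3)
    print(P2.shape)
    print(R2.shape)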