Zahra Rajabi / pymdptoolbox

Commit 5f7f035b authored Jan 24, 2013 by Steven Cordwell

fixes and unittest fixes for ValueIteration and PolicyIteration classes

parent fa69c0d3

Changes: 2 files
mdp.py
@@ -33,7 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 """

 from numpy import absolute, array, diag, matrix, mean, mod, multiply, ndarray
-from numpy import ones, zeros
+from numpy import nonzero, ones, zeros
 from numpy.random import rand
 from math import ceil, log, sqrt
 from random import randint, random
@@ -419,6 +419,13 @@ def exampleRand(S, A, is_sparse=False, mask=None):
     return (P, R)

+def getSpan(self, W):
+    """Returns the span of W
+
+    sp(W) = max W(s) - min W(s)
+    """
+    return (W.max() - W.min())
+
 class MDP(object):
     """The Markov Decision Problem Toolbox."""
@@ -502,27 +509,31 @@ class MDP(object):
         Ppolicy(SxS)  = transition matrix for policy
         PRpolicy(S)   = reward matrix for policy
         """
-        Ppolicy = zeros(())
-        PRpolicy = zeros(())
-        for a in range(self.A): # avoid looping over S
-            ind = find(self.policy == a) # the rows that use action a
-            if not isempty(ind):
-                if iscell(P):
-                    Ppolicy[ind,:] = self.P[a][ind,:]
-                else:
-                    Ppolicy[ind,:] = self.P[ind,:,a]
+        Ppolicy = matrix(zeros((self.S, self.S)))
+        Rpolicy = matrix(zeros((self.S, 1)))
+        for aa in range(self.A): # avoid looping over S
+            # the rows that use action a. .getA1() is used to make sure that
+            # ind is a 1 dimensional vector
+            ind = nonzero(self.policy == aa)[0].getA1()
+            if ind.size > 0: # if no rows use action a, then no point continuing
+                Ppolicy[ind, :] = self.P[aa][ind, :]
-                PR = self.computePR()
-                PRpolicy[ind, 1] = PR[ind, a]
-        # self.R cannot be sparse with the code in its current condition
+                #PR = self.computePR() # an apparently uneeded line, and
+                # perhaps harmful in this implementation c.f.
+                # mdp_computePpolicyPRpolicy.m
+                Rpolicy[ind] = self.R[ind, aa]
+        # self.R cannot be sparse with the code in its current condition, but
+        # it should be possible in the future. Also, if R is so big that its
+        # a good idea to use a sparse matrix for it, then converting PRpolicy
+        # from a dense to sparse matrix doesn't seem very memory efficient
         if type(self.R) is sparse:
-            PRpolicy = sparse(PRpolicy)
+            Rpolicy = sparse(Rpolicy)
         #self.Ppolicy = Ppolicy
-        #self.Rpolicy = PRpolicy
-        return (Ppolicy, PRpolicy)
+        #self.Rpolicy = Rpolicy
+        return (Ppolicy, Rpolicy)
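For reference, the rewritten computePpolicyPRpolicy selects, for each state, the transition row and the reward of the action that the current policy prescribes. A standalone sketch of that row selection using plain ndarrays and a hypothetical 2-state, 2-action MDP (the method itself works on the toolbox's matrix/object-array layout):

import numpy as np

S, A = 2, 2
P = np.array([[[0.5, 0.5], [0.8, 0.2]],      # P[a, s, s']
              [[0.0, 1.0], [0.1, 0.9]]])
R = np.array([[5.0, 10.0], [-1.0, 2.0]])     # R[s, a]
policy = np.array([1, 0])                    # action chosen in each state

Ppolicy = np.zeros((S, S))
Rpolicy = np.zeros(S)
for aa in range(A):
    ind = np.nonzero(policy == aa)[0]        # states where the policy picks aa
    if ind.size > 0:
        Ppolicy[ind, :] = P[aa][ind, :]
        Rpolicy[ind] = R[ind, aa]

# Ppolicy is now [[0.0, 1.0], [0.8, 0.2]] and Rpolicy is [10.0, -1.0]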

     def computePR(self, P, R):
         """Computes the reward for the system in one state chosing an action
@@ -584,13 +595,6 @@ class MDP(object):
         """
         raise NotImplementedError("You should create an iterate() method.")

-    def getSpan(self, W):
-        """Returns the span of W
-
-        sp(W) = max W(s) - min W(s)
-        """
-        return (W.max() - W.min())
-
     def setSilent(self):
         """Ask for running resolution functions of the MDP Toolbox in silent
         mode.
@@ -804,6 +808,8 @@ class PolicyIteration(MDP):
         if eval_type in (0, "matrix"):
             from numpy.linalg import solve
             from scipy.sparse import eye
+            self.speye = eye
+            self.lin_eq = solve
             self.eval_type = "matrix"
         elif eval_type in (1, "iterative"):
@@ -844,11 +850,11 @@ class PolicyIteration(MDP):
         epsilon-optimum value function found or maximum number of iterations reached.
         """
         if V0 == 0:
-            Vpolicy = zeros(self.S, 1)
+            policy_V = zeros((self.S, 1))
         else:
             raise NotImplementedError("evalPolicyIterative: case V0 != 0 not implemented. Use V0=0 instead.")

-        Ppolicy, PRpolicy = self.computePpolicyPRpolicy(self.P, self.R, self.policy)
+        policy_P, policy_R = self.computePpolicyPRpolicy()

         if self.verbose:
             print(' Iteration V_variation')
@@ -857,21 +863,24 @@ class PolicyIteration(MDP):
         done = False
         while not done:
             itr = itr + 1

-            Vprev = Vpolicy
-            Vpolicy = PRpolicy + self.discount * Ppolicy * Vprev
-            variation = max(abs(Vpolicy - Vprev))
+            Vprev = policy_V
+            policy_V = policy_R + self.discount * policy_P * Vprev
+            variation = absolute(policy_V - Vprev).max()

             if self.verbose:
                 print(' %s %s') % (itr, variation)

             if variation < ((1 - self.discount) / self.discount) * epsilon: # to ensure |Vn - Vpolicy| < epsilon
                 done = True
                 if self.verbose:
-                    print('MDP Toolbox: iterations stopped, epsilon-optimal value function')
+                    print('PyMDPtoolbox: iterations stopped, epsilon-optimal value function')
             elif itr == max_iter:
                 done = True
                 if self.verbose:
-                    print('MDP Toolbox: iterations stopped by maximum number of iteration condition')
+                    print('PyMDPtoolbox: iterations stopped by maximum number of iteration condition')

-        self.value = Vpolicy
+        self.value = policy_V
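The renamed variables make the fixed point of the evaluation loop explicit: repeat V <- policy_R + discount * policy_P * V until the largest componentwise change falls below ((1 - discount) / discount) * epsilon. A self-contained sketch of the same iteration with hypothetical inputs:

import numpy as np

def eval_policy_iterative(P_pi, R_pi, discount, epsilon=0.0001, max_iter=10000):
    V = np.zeros(P_pi.shape[0])
    for _ in range(max_iter):
        V_prev = V
        V = R_pi + discount * P_pi.dot(V_prev)            # V <- R_pi + g * P_pi * V
        if np.absolute(V - V_prev).max() < ((1 - discount) / discount) * epsilon:
            break                                         # ensures |Vn - Vpolicy| < epsilon
    return V

# e.g. with the policy matrices from the sketch above
print(eval_policy_iterative(np.array([[0.0, 1.0], [0.8, 0.2]]), np.array([10.0, -1.0]), 0.9))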

     def evalPolicyMatrix(self):
         """Evaluation of the value function of a policy
@@ -894,12 +903,12 @@ class PolicyIteration(MDP):
         Vpolicy(S) = value function of the policy
         """
-        Ppolicy, PRpolicy = self.computePpolicyPRpolicy(self.P, self.R, self.policy)
+        Ppolicy, Rpolicy = self.computePpolicyPRpolicy()
         # V = PR + gPV => (I-gP)V = PR => V = inv(I-gP)* PR
-        self.value = self.lin_eq((speye(self.S, self.S) - self.discount * Ppolicy), PRpolicy)
+        self.value = self.lin_eq((self.speye(self.S, self.S) - self.discount * Ppolicy), Rpolicy)
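With self.speye and self.lin_eq stashed in __init__ (see the earlier hunk), the matrix evaluation reduces to solving the linear system (I - discount * Ppolicy) V = Rpolicy. A standalone sketch using numpy directly, with hypothetical policy matrices:

import numpy as np

P_pi = np.array([[0.0, 1.0], [0.8, 0.2]])    # transition matrix under the policy
R_pi = np.array([10.0, -1.0])                # reward vector under the policy
discount = 0.9

# V = PR + gPV  =>  (I - gP) V = PR
V = np.linalg.solve(np.eye(2) - discount * P_pi, R_pi)
print(V)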

     def iterate(self):
-        """"""
+        """
+        Run the policy iteration algorithm.
+        """
         if self.verbose:
             print(' Iteration Number_of_different_actions')
@@ -927,8 +936,16 @@ class PolicyIteration(MDP):
             if self.verbose:
                 print(' %s %s') % (self.iter, n_different)

-            if (policy_next == self.policy).all() or (self.iter == self.max_iter):
+            if n_different == 0:
                 done = True
+                if self.verbose:
+                    print("...iterations stopped, unchanging policy found")
+            elif (self.iter == self.max_iter):
+                done = True
+                if self.verbose:
+                    print("...iterations stopped by maximum number of iteration condition")
             else:
                 self.policy = policy_next

         self.time = time() - self.time
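The stopping test now counts how many states change action (n_different) instead of comparing whole policies, and reports which condition fired. An end-to-end sketch of the revised loop for a hypothetical 2-state MDP (the class delegates to bellmanOperator and the evalPolicy* methods instead):

import numpy as np

P = np.array([[[0.5, 0.5], [0.8, 0.2]],
              [[0.0, 1.0], [0.1, 0.9]]])     # P[a, s, s']
R = np.array([[5.0, 10.0], [-1.0, 2.0]])     # R[s, a]
discount, max_iter = 0.9, 1000

policy = np.zeros(2, dtype=int)
for itr in range(1, max_iter + 1):
    # evaluate the current policy, then improve it greedily
    P_pi = P[policy, np.arange(2), :]
    R_pi = R[np.arange(2), policy]
    V = np.linalg.solve(np.eye(2) - discount * P_pi, R_pi)
    Q = R + discount * np.einsum('ast,t->sa', P, V)
    policy_next = Q.argmax(axis=1)
    n_different = (policy_next != policy).sum()
    if n_different == 0:
        break            # unchanging policy found
    if itr == max_iter:
        break            # stopped by the maximum number of iteration condition
    policy = policy_next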
@@ -1009,7 +1026,7 @@ class PolicyIterationModified(MDP):
             Vnext, policy = self.bellmanOperator(self.P, self.PR, self.discount, self.V)
             #[Ppolicy, PRpolicy] = mdp_computePpolicyPRpolicy(P, PR, policy);
-            variation = self.getSpan(Vnext - self.value);
+            variation = getSpan(Vnext - self.value);
             if self.verbose:
                 print(" %s %s" % (self.iter, variation))
@@ -1237,7 +1254,7 @@ class RelativeValueIteration(MDP):
             Unext, policy = self.bellmanOperator(self.P, self.R, 1, self.U)
             Unext = Unext - self.gain
-            variation = self.getSpan(Unext - self.U)
+            variation = getSpan(Unext - self.U)
             if self.verbose:
                 print(" %s %s" % (self.iter, variation))
@@ -1382,8 +1399,8 @@ class ValueIteration(MDP):
         if (initial_value == 0):
             self.value = matrix(zeros((self.S, 1)))
         else:
-            if (initial_value.size != self.S):
-                raise ValueError("The initial value must be length S")
+            if (not initial_value.shape in ((self.S, ), (self.S, 1), (1, self.S))):
+                raise ValueError("The initial value must be a vector of length S")
             else:
                 self.value = matrix(initial_value)
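The new check accepts any of the three vector shapes rather than only testing the element count. A tiny sketch with a hypothetical initial value:

from numpy import matrix, zeros

S = 3
initial_value = zeros((S, 1))                 # also accepted: shape (S,) or (1, S)
if not initial_value.shape in ((S, ), (S, 1), (1, S)):
    raise ValueError("The initial value must be a vector of length S")
value = matrix(initial_value)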
@@ -1431,10 +1448,10 @@ class ValueIteration(MDP):
         k = 1 - h.sum()
         Vprev = self.value
-        self.bellmanOperator()
+        null, value = self.bellmanOperator()
         # p 201, Proposition 6.6.5
-        max_iter = log( (epsilon * (1 - self.discount) / self.discount) / self.getSpan(self.value - Vprev) ) / log(self.discount * k)
-        self.value = Vprev
+        max_iter = log( (epsilon * (1 - self.discount) / self.discount) / getSpan(value - Vprev) ) / log(self.discount * k)
+        #self.value = Vprev

         self.max_iter = ceil(max_iter)
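boundIter now uses the value returned by bellmanOperator instead of the mutated self.value when computing the iteration bound from the Proposition 6.6.5 result cited in the comment. A numeric sketch of the bound, with hypothetical values for k and the span:

from math import ceil, log

epsilon, discount = 0.01, 0.9
k = 0.5                      # stands in for 1 - h.sum() in the method
span = 3.0                   # stands in for getSpan(value - Vprev) after one backup

max_iter = log((epsilon * (1 - discount) / discount) / span) / log(discount * k)
print(ceil(max_iter))        # 10 for these numbers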
@@ -1458,7 +1475,7 @@ class ValueIteration(MDP):
             # The values, based on Q. For the function "max()": the option
             # "axis" means the axis along which to operate. In this case it
             # finds the maximum of the the rows. (Operates along the columns?)
-            variation = self.getSpan(self.value - Vprev)
+            variation = getSpan(self.value - Vprev)
             if self.verbose:
                 print(" %s %s" % (self.iter, variation))
@@ -1566,7 +1583,7 @@ class ValueIterationGS(ValueIteration):
                     Q[a] = self.R[s, a] + self.discount * self.P[a][s,:] * self.value
                 self.value[s] = max(Q)

-            variation = self.getSpan(V - Vprev)
+            variation = getSpan(V - Vprev)
             if self.verbose:
                 print(" %s %s" % (self.iter, variation))
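In the Gauss-Seidel variant each state's value is overwritten as soon as it is recomputed, so later states in the same sweep already see the updated values. A sketch of one sweep over a hypothetical 2-state MDP:

import numpy as np

P = np.array([[[0.5, 0.5], [0.8, 0.2]],
              [[0.0, 1.0], [0.1, 0.9]]])     # P[a, s, s']
R = np.array([[5.0, 10.0], [-1.0, 2.0]])     # R[s, a]
discount = 0.9
V = np.zeros(2)

for s in range(2):
    Q = [R[s, a] + discount * P[a][s, :].dot(V) for a in range(2)]
    V[s] = max(Q)                            # in-place update seen by the next state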
test_mdptoolbox.py
@@ -172,10 +172,10 @@ R = array([[5, 10], [-1, 2]])
 # MDP

 def test_MDP_P_R_1():
-    P1 = zeros((ACTIONS, ), dtype=object)
-    P1[0] = matrix([[0.5, 0.5],[0.8, 0.2]])
-    P1[1] = matrix([[0, 1],[0.1, 0.9]])
-    R1 = matrix([[5, 10], [-1, 2]])
+    P1 = zeros((2, ), dtype=object)
+    P1[0] = matrix('0.5 0.5; 0.8 0.2')
+    P1[1] = matrix('0 1; 0.1 0.9')
+    R1 = matrix('5 10; -1 2')
     a = MDP(P, R, 0.9, 0.01)
     assert a.P.dtype == P1.dtype
     assert a.R.dtype == R1.dtype
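The expected matrices in the updated tests use NumPy's MATLAB-style string constructor, which builds the same matrix as the nested-list form it replaces; P1 remains an object array holding one S x S matrix per action:

from numpy import matrix, zeros

P1 = zeros((2, ), dtype=object)          # one transition matrix per action
P1[0] = matrix('0.5 0.5; 0.8 0.2')       # same as matrix([[0.5, 0.5], [0.8, 0.2]])
P1[1] = matrix('0 1; 0.1 0.9')
R1 = matrix('5 10; -1 2')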
@@ -185,10 +185,10 @@ def test_MDP_P_R_1():
 def test_MDP_P_R_2():
     R = array([[[5, 10], [-1, 2]], [[1, 2], [3, 4]]])
-    P1 = zeros((ACTIONS, ), dtype=object)
-    P1[0] = matrix([[0.5, 0.5],[0.8, 0.2]])
-    P1[1] = matrix([[0, 1],[0.1, 0.9]])
-    R1 = matrix([[7.5, 2], [-0.4, 3.9]])
+    P1 = zeros((2, ), dtype=object)
+    P1[0] = matrix('0.5 0.5; 0.8 0.2')
+    P1[1] = matrix('0 1; 0.1 0.9')
+    R1 = matrix('7.5 2; -0.4 3.9')
     a = MDP(P, R, 0.9, 0.01)
     assert type(a.P) == type(P1)
     assert type(a.R) == type(R1)
@@ -201,7 +201,7 @@ def test_MDP_P_R_2():
 def test_MDP_P_R_3():
     P = array([[[0.6116, 0.3884],[0, 1]],[[0.6674, 0.3326],[0, 1]]])
     R = array([[[-0.2433, 0.7073],[0, 0.1871]],[[-0.0069, 0.6433],[0, 0.2898]]])
-    PR = matrix([[0.12591304, 0.20935652], [0.1871, 0.2898]])
+    PR = matrix('0.12591304 0.20935652; 0.1871 0.2898')
     a = MDP(P, R, 0.9, 0.01)
     assert (absolute(a.R - PR) < SMALLNUM).all()
@@ -229,16 +229,73 @@ def test_ValueIteration_exampleForest():
 def test_PolicyIteration_init_policy0():
     a = PolicyIteration(P, R, 0.9)
-    p = array((1, 1)).reshape(2, 1)
-    assert (absolute(a.policy - p) < SMALLNUM).all()
-
-def test_PolicyIteration():
-    PolicyIteration(P, R, 0.9)
-    #inst.iterate()
-    #assert (abs(inst.value[0] - 42.4419) < 0.001)
-    #assert (abs(inst.value[1] - 36.0465) < 0.001)
-    #assert (inst.policy == (1, 0))
-    #assert (inst.iter == 2)
+    p = matrix('1; 1')
+    assert (a.policy == p).all()
+
+def test_PolicyIteration_init_policy0_exampleForest():
+    P, R = exampleForest()
+    a = PolicyIteration(P, R, 0.9)
+    p = matrix('0; 1; 0')
+    assert (a.policy == p).all()
+
+def test_PolicyIteration_computePpolicyPRpolicy_exampleForest():
+    P, R = exampleForest()
+    a = PolicyIteration(P, R, 0.9)
+    P1 = matrix('0.1 0.9 0; 1 0 0; 0.1 0 0.9')
+    R1 = matrix('0; 1; 4')
+    Ppolicy, Rpolicy = a.computePpolicyPRpolicy()
+    assert (absolute(Ppolicy - P1) < SMALLNUM).all()
+    assert (absolute(Rpolicy - R1) < SMALLNUM).all()
+
+def test_PolicyIteration_evalPolicyIterative_exampleForest():
+    P, R = exampleForest()
+    v0 = matrix('0; 0; 0')
+    v1 = matrix('4.47504640074458; 5.02753258879703; 23.17234211944304')
+    p = matrix('0; 1; 0')
+    a = PolicyIteration(P, R, 0.9)
+    assert (absolute(a.value - v0) < SMALLNUM).all()
+    a.evalPolicyIterative()
+    assert (absolute(a.value - v1) < SMALLNUM).all()
+    assert (a.policy == p).all()
+
+def test_PolicyIteration_evalPolicyIterative_bellmanOperator_exampleForest():
+    P, R = exampleForest()
+    v = matrix('4.47504640074458; 5.02753258879703; 23.17234211944304')
+    p = matrix('0; 0; 0')
+    a = PolicyIteration(P, R, 0.9)
+    a.evalPolicyIterative()
+    policy, value = a.bellmanOperator()
+    assert (policy == p).all()
+    assert (absolute(a.value - v) < SMALLNUM).all()
+
+def test_PolicyIteration_iterative_exampleForest():
+    P, R = exampleForest()
+    a = PolicyIteration(P, R, 0.9, eval_type=1)
+    V = matrix('26.2439058351861 29.4839058351861 33.4839058351861')
+    p = matrix('0 0 0')
+    itr = 2
+    a.iterate()
+    assert (absolute(array(a.value) - V) < SMALLNUM).all()
+    assert (array(a.policy) == p).all()
+    assert a.iter == itr
+
+def test_PolicyIteration_evalPolicyMatrix_exampleForest():
+    P, R = exampleForest()
+    v_pol = matrix('4.47513812154696; 5.02762430939227; 23.17243384704857')
+    a = PolicyIteration(P, R, 0.9)
+    a.evalPolicyMatrix()
+    assert (absolute(a.value - v_pol) < SMALLNUM).all()
+
+def test_PolicyIteration_matrix_exampleForest():
+    P, R = exampleForest()
+    a = PolicyIteration(P, R, 0.9)
+    V = matrix('26.2440000000000 29.4840000000000 33.4840000000000')
+    p = matrix('0 0 0')
+    itr = 2
+    a.iterate()
+    assert (absolute(array(a.value) - V) < SMALLNUM).all()
+    assert (array(a.policy) == p).all()
+    assert a.iter == itr

 #def test_JacksCarRental():
 #    S = 21 ** 2