pymdptoolbox (Zahra Rajabi)
Commit 0660caca, authored Jan 25, 2013 by Steven Cordwell
completed RelativeValueIteration class to a useful state

parent 4f17fb5f
Changes: 2 files
mdp.py
...
...
@@ -304,7 +304,6 @@ def exampleForest(S=3, r1=4, r2=2, p=0.1):
     array([[[ 0.1,  0.9,  0. ],
             [ 0.1,  0. ,  0.9],
             [ 0.1,  0. ,  0.9]],
            [[ 1. ,  0. ,  0. ],
             [ 1. ,  0. ,  0. ],
             [ 1. ,  0. ,  0. ]]])
...
...
@@ -429,12 +428,12 @@ def getSpan(W):
 class MDP(object):
     """The Markov Decision Problem Toolbox."""
 
-    def __init__(self, transitions, reward, discount, max_iter):
+    def __init__(self, transitions, reward, discount, epsilon, max_iter):
         """"""
 
         # if the discount is None then the algorithm is assumed to not use it
         # in its computations
-        if (type(discount) is int) or (type(discount) is float):
+        if type(discount) in (int, float):
             if (discount <= 0) or (discount > 1):
                 raise ValueError(mdperr["discount_rng"])
             else:
...
...
@@ -448,8 +447,8 @@ class MDP(object):
         # if the max_iter is None then the algorithm is assumed to not use it
         # in its computations
-        if (type(max_iter) is int) or (type(max_iter) is float):
-            if (max_iter <= 0):
+        if type(max_iter) in (int, float):
+            if max_iter <= 0:
                 raise ValueError(mdperr["maxi_min"])
             else:
                 self.max_iter = max_iter
...
...
@@ -457,6 +456,13 @@ class MDP(object):
                 raise ValueError("PyMDPtoolbox: max_iter must be a positive real " \
                     "number greater than zero.")
 
+        if type(epsilon) in (int, float):
+            if epsilon <= 0:
+                raise ValueError("PyMDPtoolbox: epsilon must be greater than 0")
+        elif not epsilon is None:
+            raise ValueError("PyMDPtoolbox: epsilon must be a positive real " \
+                "number greater than zero.")
+
         # we run a check on P and R to make sure they are describing an MDP. If
         # an exception isn't raised then they are assumed to be correct.
         check(transitions, reward)
...
...
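The new epsilon handling above accepts an int or float greater than zero, or None when the algorithm does not use epsilon. A minimal standalone sketch of that check (validate_epsilon is a hypothetical helper name, not part of pymdptoolbox):

# Minimal standalone sketch of the epsilon check added above.
# validate_epsilon is a hypothetical helper, not a toolbox function.
def validate_epsilon(epsilon):
    if type(epsilon) in (int, float):
        if epsilon <= 0:
            raise ValueError("epsilon must be greater than 0")
    elif epsilon is not None:
        raise ValueError("epsilon must be a positive real number or None")
    return epsilon

validate_epsilon(0.01)  # accepted
validate_epsilon(None)  # accepted: the algorithm is assumed not to use epsilon
# validate_epsilon(-1)  # would raise ValueError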
@@ -744,7 +750,7 @@ class PolicyIteration(MDP):
     def __init__(self, transitions, reward, discount, policy0=None, max_iter=1000, eval_type=0):
         """"""
 
-        MDP.__init__(self, transitions, reward, discount, max_iter)
+        MDP.__init__(self, transitions, reward, discount, None, max_iter)
 
         if policy0 == None:
             # initialise the policy to the one which maximises the expected
...
...
@@ -913,7 +919,7 @@ class PolicyIteration(MDP):
         Ppolicy, Rpolicy = self.computePpolicyPRpolicy()
         # V = PR + gPV  =>  (I-gP)V = PR  =>  V = inv(I-gP)* PR
-        self.V = self.lin_eq((self.speye(self.S, self.S) - self.discount * Ppolicy) , Rpolicy)
+        self.V = self.lin_eq((self.speye(self.S, self.S) - self.discount * Ppolicy), Rpolicy)
 
     def iterate(self):
         """Run the policy iteration algorithm."""
...
...
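The comment above spells out the matrix policy-evaluation step: V = PR + gPV, so (I - gP)V = PR and V = inv(I - gP) PR. A small self-contained numpy sketch of the same computation, solving the linear system directly rather than forming the inverse; the two-state numbers are illustrative and not taken from the toolbox:

# Self-contained sketch of the matrix policy evaluation step above:
# solve (I - g*Ppolicy) V = Rpolicy instead of inverting explicitly.
from numpy import array, eye
from numpy.linalg import solve

discount = 0.9
Ppolicy = array([[0.8, 0.2],
                 [0.1, 0.9]])   # state transitions under the fixed policy
Rpolicy = array([1.0, 2.0])     # expected reward per state under that policy

V = solve(eye(2) - discount * Ppolicy, Rpolicy)
print(V)                        # value of following the policy forever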
@@ -961,7 +967,7 @@ class PolicyIteration(MDP):
         self.V = tuple(array(self.V).reshape(self.S).tolist())
         self.policy = tuple(array(self.policy).reshape(self.S).tolist())
 
-class PolicyIterationModified(MDP):
+class PolicyIterationModified(PolicyIteration):
     """Resolution of discounted MDP with policy iteration algorithm
 
     Arguments
...
...
@@ -1002,10 +1008,16 @@ class PolicyIterationModified(MDP):
     def __init__(self, transitions, reward, discount, epsilon=0.01, max_iter=10):
         """"""
 
-        MDP.__init__(self, transitions, reward, discount, max_iter)
+        PolicyIteration.__init__(self, transitions, reward, discount, None, max_iter, 1)
 
-        if epsilon <= 0:
-            raise ValueError("epsilon must be greater than 0")
+        # PolicyIteration doesn't pass epsilon to MDP.__init__() so we will
+        # check it here
+        if type(epsilon) in (int, float):
+            if epsilon <= 0:
+                raise ValueError("PyMDPtoolbox: epsilon must be greater than 0")
+        else:
+            raise ValueError("PyMDPtoolbox: epsilon must be a positive real " \
+                "number greater than zero.")
 
         # computation of threshold of variation for V for an epsilon-optimal policy
         if self.discount != 1:
...
...
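PolicyIterationModified then derives a stopping threshold on the variation of V for an epsilon-optimal policy; the diff is cut off before the formula. A hedged sketch of the usual threshold used by the MATLAB MDP Toolbox this code is ported from (variation_threshold is a made-up name, not a pymdptoolbox function):

# Hedged sketch of the epsilon-optimal variation threshold mentioned above.
def variation_threshold(epsilon, discount):
    if discount != 1:
        return epsilon * (1 - discount) / discount
    return epsilon

print(variation_threshold(0.01, 0.9))  # ~0.00111
print(variation_threshold(0.01, 1))    # 0.01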
@@ -1128,7 +1140,7 @@ class QLearning(MDP):
             raise ValueError("PyMDPtoolbox: n_iter should be greater than 10000")
 
         # after this n_iter will be known as self.max_iter
-        MDP.__init__(self, transitions, reward, discount, n_iter)
+        MDP.__init__(self, transitions, reward, discount, None, n_iter)
 
         # Initialisations
         self.Q = zeros((self.S, self.A))
...
...
@@ -1238,13 +1250,15 @@ class RelativeValueIteration(MDP):
     def __init__(self, transitions, reward, epsilon=0.01, max_iter=1000):
 
-        MDP.__init__(self, transitions, reward, None, max_iter)
+        MDP.__init__(self, transitions, reward, None, epsilon, max_iter)
 
-        if epsilon <= 0:
-            print('MDP Toolbox ERROR: epsilon must be upper than 0')
-
-        self.U = zeros(self.S, 1)
-        self.gain = self.U[self.S]
+        self.epsilon = epsilon
+        self.discount = 1
+
+        self.V = matrix(zeros((self.S, 1)))
+        self.gain = 0 # self.U[self.S]
+
+        self.average_reward = None
 
     def iterate(self):
         """"""
...
...
@@ -1259,29 +1273,33 @@ class RelativeValueIteration(MDP):
             self.iter = self.iter + 1;
 
-            Unext, policy = self.bellmanOperator(self.P, self.R, 1, self.U)
-            Unext = Unext - self.gain
+            self.policy, Vnext = self.bellmanOperator()
+            Vnext = Vnext - self.gain
 
-            variation = getSpan(Unext - self.U)
+            variation = getSpan(Vnext - self.V)
 
             if self.verbose:
                 print(" %s %s" % (self.iter, variation))
 
             if variation < self.epsilon:
                 done = True
-                average_reward = self.gain + min(Unext - self.U)
+                self.average_reward = self.gain + (Vnext - self.V).min()
                 if self.verbose:
                     print('MDP Toolbox : iterations stopped, epsilon-optimal policy found')
 
             elif self.iter == self.max_iter:
                 done = True
-                average_reward = self.gain + min(Unext - self.U);
+                self.average_reward = self.gain + (Vnext - self.V).min()
                 if self.verbose:
                     print('MDP Toolbox : iterations stopped by maximum number of iteration condition')
 
-            self.U = Unext
-            self.gain = self.U(self.S)
+            self.V = Vnext
+            self.gain = float(self.V[self.S - 1])
 
         self.time = time() - self.time
 
+        # store value and policy as tuples
+        self.V = tuple(self.V.getA1().tolist())
+        self.policy = tuple(self.policy.getA1().tolist())
 
 class ValueIteration(MDP):
     """
...
...
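The reworked iterate() above is undiscounted relative value iteration: apply the Bellman backup, subtract the current gain, stop when the span of the change in V falls below epsilon, and anchor the gain on the last state's value. A self-contained numpy sketch of that loop, using plain arrays and the small two-action P and R from the test file below rather than the toolbox classes (span() stands in for getSpan()):

# Self-contained numpy sketch of the relative value iteration loop above.
import numpy as np

P = np.array([[[0.5, 0.5], [0.8, 0.2]],   # P[action, state, next_state]
              [[0.0, 1.0], [0.1, 0.9]]])
R = np.array([[5.0, 10.0],                # R[state, action]
              [-1.0, 2.0]])

def span(w):
    return w.max() - w.min()

V, gain = np.zeros(2), 0.0
epsilon, max_iter = 0.01, 1000
for it in range(1, max_iter + 1):
    Q = R + np.einsum('ast,t->sa', P, V)  # undiscounted Bellman backup
    policy, Vnext = Q.argmax(axis=1), Q.max(axis=1) - gain
    variation = span(Vnext - V)
    average_reward = gain + (Vnext - V).min()
    V, gain = Vnext, Vnext[-1]            # anchor the gain on the last state
    if variation < epsilon:
        break
print(it, policy, average_reward)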
@@ -1401,21 +1419,18 @@ class ValueIteration(MDP):
     def __init__(self, transitions, reward, discount, epsilon=0.01, max_iter=1000, initial_value=0):
         """Resolution of discounted MDP with value iteration algorithm."""
 
-        MDP.__init__(self, transitions, reward, discount, max_iter)
+        MDP.__init__(self, transitions, reward, discount, epsilon, max_iter)
 
         # initialization of optional arguments
-        if (initial_value == 0):
+        if initial_value == 0:
             self.V = matrix(zeros((self.S, 1)))
         else:
-            if (not initial_value.shape in ((self.S, ), (self.S, 1), (1, self.S))):
+            if not initial_value.shape in ((self.S, ), (self.S, 1), (1, self.S)):
                 raise ValueError("PyMDPtoolbox: The initial value must be a vector of length S")
             else:
                 self.V = matrix(initial_value)
-        if epsilon <= 0:
-            raise ValueError("PyMDPtoolbox: epsilon must be greater than 0")
 
-        if (self.discount < 1):
+        if self.discount < 1:
             # compute a bound for the number of iterations and update the
             # stored value of self.max_iter
             self.boundIter(epsilon)
...
...
@@ -1464,7 +1479,7 @@ class ValueIteration(MDP):
         max_iter = log((epsilon * (1 - self.discount) / self.discount) / getSpan(value - Vprev)) / log(self.discount * k)
         #self.V = Vprev
 
-        self.max_iter = ceil(max_iter)
+        self.max_iter = int(ceil(max_iter))
 
     def iterate(self):
         """
...
...
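boundIter() computes the iteration bound shown above as log((epsilon * (1 - discount) / discount) / span(V1 - V0)) / log(discount * k); the change simply casts the ceiling to an int so it can be used as an iteration count. A worked sketch with illustrative numbers (k and the span of the first Bellman step are just assumed here; in the toolbox k is derived from the transition matrices):

# Worked sketch of the iteration bound computed by boundIter() above.
from math import ceil, log

epsilon, discount = 0.01, 0.9
k = 0.5                 # assumed contraction-related constant
first_step_span = 3.0   # assumed getSpan(value - Vprev) after one backup

max_iter = log((epsilon * (1 - discount) / discount) / first_step_span) \
    / log(discount * k)
max_iter = int(ceil(max_iter))  # the change above: cast the bound to an int
print(max_iter)                 # 10 with these illustrative numbers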
test_mdptoolbox.py
...
...
@@ -6,7 +6,8 @@ Created on Sun May 27 23:16:57 2012
 """
 
 from mdp import check, checkSquareStochastic, exampleForest, exampleRand, MDP
-from mdp import PolicyIteration, ValueIteration, ValueIterationGS
+from mdp import PolicyIteration, RelativeValueIteration, ValueIteration
+from mdp import ValueIterationGS
 
 from numpy import absolute, array, eye, matrix, zeros
 from numpy.random import rand
...
...
@@ -18,6 +19,13 @@ STATES = 10
 ACTIONS = 3
 SMALLNUM = 10e-12
 
+# Arrays
+P = array([[[0.5, 0.5], [0.8, 0.2]], [[0, 1], [0.1, 0.9]]])
+R = array([[5, 10], [-1, 2]])
+Pf, Rf = exampleForest()
+Pr, Rr = exampleRand(STATES, ACTIONS)
+Prs, Rrs = exampleRand(STATES, ACTIONS, is_sparse=True)
+
 # check: square, stochastic and non-negative ndarrays
 
 def test_check_square_stochastic_nonnegative_array_1():
...
...
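The fixtures P, R, Pf, Rf, Pr, Rr, Prs, Rrs are now built once at module level, and the scattered per-section copies are removed further down. A quick hedged look at what two of them contain, assuming this commit's mdp.py is importable from the working directory:

# Quick look at two of the module-level fixtures added above.
from mdp import exampleForest, exampleRand

Pf, Rf = exampleForest()          # small 2-action, 3-state forest problem
print(Pf.shape, Rf.shape)         # expected: (2, 3, 3) and (3, 2)

Pr, Rr = exampleRand(10, 3)       # random dense MDP with STATES=10, ACTIONS=3
print(Pr.shape)                   # expected: (3, 10, 10) per the shape test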
@@ -130,7 +138,6 @@ def test_checkSquareStochastic_eye_sparse():
     assert checkSquareStochastic(P) == None
 
 # exampleForest
 
-Pf, Rf = exampleForest()
 
 def test_exampleForest_P_shape():
     assert (Pf == array([[[0.1, 0.9, 0.0],
...
...
@@ -151,8 +158,6 @@ def test_exampleForest_check():
 # exampleRand
 
-Pr, Rr = exampleRand(STATES, ACTIONS)
 
 def test_exampleRand_dense_P_shape():
     assert (Pr.shape == (ACTIONS, STATES, STATES))
...
...
@@ -162,8 +167,6 @@ def test_exampleRand_dense_R_shape():
 def test_exampleRand_dense_check():
     assert check(Pr, Rr) == None
 
-Prs, Rrs = exampleRand(STATES, ACTIONS, is_sparse=True)
 
 def test_exampleRand_sparse_P_shape():
     assert (Prs.shape == (ACTIONS, ))
...
...
@@ -173,9 +176,6 @@ def test_exampleRand_sparse_R_shape():
 def test_exampleRand_sparse_check():
     assert check(Prs, Rrs) == None
 
-P = array([[[0.5, 0.5], [0.8, 0.2]], [[0, 1], [0.1, 0.9]]])
-R = array([[5, 10], [-1, 2]])
 
 # MDP
 
 def test_MDP_P_R_1():
...
...
@@ -298,6 +298,11 @@ def test_PolicyIteration_matrix_exampleForest():
 # ValueIterationGS
 
+def test_ValueIterationGS_boundIter_exampleForest():
+    a = ValueIterationGS(Pf, Rf, 0.9)
+    itr = 39
+    assert (a.max_iter == itr)
+
 def test_ValueIterationGS_exampleForest():
     a = ValueIterationGS(Pf, Rf, 0.9)
     p = matrix('0 0 0')
...
...
@@ -308,6 +313,18 @@ def test_ValueIterationGS_exampleForest():
     assert a.iter == itr
     assert (absolute(array(a.V) - v) < SMALLNUM).all()
 
+# RelativeValueIteration
+
+def test_RelativeValueIteration_exampleForest():
+    a = RelativeValueIteration(Pf, Rf)
+    itr = 4
+    p = matrix('0 0 0')
+    v = matrix('-4.360000000000000 -0.760000000000000 3.240000000000000')
+    a.iterate()
+    assert (array(a.policy) == p).all()
+    assert a.iter == itr
+    assert (absolute(array(a.V) - v) < SMALLNUM).all()
+
 #def test_JacksCarRental():
 #    S = 21 ** 2
 #    A = 11
...
...
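Outside the test suite, the newly completed class would be driven the same way the new test above does it. A hedged usage sketch, with expected outputs taken from the values asserted in that test and the import assuming this commit's mdp.py is on the path:

# Usage sketch mirroring test_RelativeValueIteration_exampleForest above.
from mdp import RelativeValueIteration, exampleForest

P, R = exampleForest()
rvi = RelativeValueIteration(P, R)  # defaults: epsilon=0.01, max_iter=1000
rvi.iterate()
print(rvi.policy)                   # expected (0, 0, 0)
print(rvi.V)                        # expected approx (-4.36, -0.76, 3.24)
print(rvi.average_reward)           # long-run average reward estimate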