Zahra Rajabi / pymdptoolbox / Commits / 0660caca

Commit 0660caca authored Jan 25, 2013 by Steven Cordwell

completed RelativeValueIteration class to a useful state

parent 4f17fb5f
Changes 2
mdp.py  (view file @ 0660caca)
...
@@ -304,7 +304,6 @@ def exampleForest(S=3, r1=4, r2=2, p=0.1):
     array([[[ 0.1,  0.9,  0. ],
             [ 0.1,  0. ,  0.9],
             [ 0.1,  0. ,  0.9]],
            [[ 1. ,  0. ,  0. ],
             [ 1. ,  0. ,  0. ],
             [ 1. ,  0. ,  0. ]]])
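For a quick, toolbox-independent check that the transition array shown in this docstring is row-stochastic, the following numpy-only snippet (not part of the commit) can be used:

    from numpy import array

    P = array([[[0.1, 0.9, 0.0],
                [0.1, 0.0, 0.9],
                [0.1, 0.0, 0.9]],
               [[1.0, 0.0, 0.0],
                [1.0, 0.0, 0.0],
                [1.0, 0.0, 0.0]]])
    # each row sums to 1, so both action matrices are valid stochastic matrices
    print(P.sum(axis=2))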
...
@@ -429,12 +428,12 @@ def getSpan(W):
 class MDP(object):
     """The Markov Decision Problem Toolbox."""
 
-    def __init__(self, transitions, reward, discount, max_iter):
+    def __init__(self, transitions, reward, discount, epsilon, max_iter):
         """"""
         # if the discount is None then the algorithm is assumed to not use it
         # in its computations
-        if (type(discount) is int) or (type(discount) is float):
+        if type(discount) in (int, float):
             if (discount <= 0) or (discount > 1):
                 raise ValueError(mdperr["discount_rng"])
             else:
...
@@ -448,8 +447,8 @@ class MDP(object):
         # if the max_iter is None then the algorithm is assumed to not use it
         # in its computations
-        if (type(max_iter) is int) or (type(max_iter) is float):
+        if type(max_iter) in (int, float):
-            if (max_iter <= 0):
+            if max_iter <= 0:
                 raise ValueError(mdperr["maxi_min"])
             else:
                 self.max_iter = max_iter
...
@@ -457,6 +456,13 @@ class MDP(object):
             raise ValueError("PyMDPtoolbox: max_iter must be a positive real " \
                 "number greater than zero.")
+        
+        if type(epsilon) in (int, float):
+            if epsilon <= 0:
+                raise ValueError("PyMDPtoolbox: epsilon must be greater than 0")
+        elif not epsilon is None:
+            raise ValueError("PyMDPtoolbox: epsilon must be a positive real " \
+                "number greater than zero.")
         # we run a check on P and R to make sure they are describing an MDP. If
         # an exception isn't raised then they are assumed to be correct.
         check(transitions, reward)
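With this hunk, MDP.__init__() now takes an epsilon argument, which subclasses pass as None when their algorithm does not use a stopping tolerance. A minimal standalone sketch of that validation logic, so it can be tried outside the class (the helper name validate_epsilon is illustrative, not part of pymdptoolbox):

    # Hypothetical standalone version of the epsilon check added to MDP.__init__()
    def validate_epsilon(epsilon):
        if type(epsilon) in (int, float):
            if epsilon <= 0:
                raise ValueError("epsilon must be greater than 0")
        elif epsilon is not None:
            raise ValueError("epsilon must be a positive real number or None")
        return epsilon

    validate_epsilon(0.01)   # fine
    validate_epsilon(None)   # fine: the algorithm does not use epsilon
    # validate_epsilon(-1)   # would raise ValueError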
...
@@ -744,7 +750,7 @@ class PolicyIteration(MDP):
     def __init__(self, transitions, reward, discount, policy0=None, max_iter=1000, eval_type=0):
         """"""
-        MDP.__init__(self, transitions, reward, discount, max_iter)
+        MDP.__init__(self, transitions, reward, discount, None, max_iter)
         
         if policy0 == None:
             # initialise the policy to the one which maximises the expected
...
@@ -913,7 +919,7 @@ class PolicyIteration(MDP):
         Ppolicy, Rpolicy = self.computePpolicyPRpolicy()
         # V = PR + gPV => (I-gP)V = PR => V = inv(I-gP)* PR
-        self.V = self.lin_eq((self.speye(self.S, self.S) - self.discount * Ppolicy) , Rpolicy)
+        self.V = self.lin_eq((self.speye(self.S, self.S) - self.discount * Ppolicy), Rpolicy)
     
     def iterate(self):
         """Run the policy iteration algorithm."""
...
@@ -961,7 +967,7 @@ class PolicyIteration(MDP):
         self.V = tuple(array(self.V).reshape(self.S).tolist())
         self.policy = tuple(array(self.policy).reshape(self.S).tolist())
 
-class PolicyIterationModified(MDP):
+class PolicyIterationModified(PolicyIteration):
     """Resolution of discounted MDP with policy iteration algorithm
     
     Arguments
...
@@ -1002,10 +1008,16 @@ class PolicyIterationModified(MDP):
     def __init__(self, transitions, reward, discount, epsilon=0.01, max_iter=10):
         """"""
-        MDP.__init__(self, transitions, reward, discount, max_iter)
+        PolicyIteration.__init__(self, transitions, reward, discount, None, max_iter, 1)
         
-        if epsilon <= 0:
-            raise ValueError("epsilon must be greater than 0")
+        # PolicyIteration doesn't pass epsilon to MDP.__init__() so we will
+        # check it here
+        if type(epsilon) in (int, float):
+            if epsilon <= 0:
+                raise ValueError("PyMDPtoolbox: epsilon must be greater than 0")
+        else:
+            raise ValueError("PyMDPtoolbox: epsilon must be a positive real " \
+                "number greater than zero.")
         
         # computation of threshold of variation for V for an epsilon-optimal policy
         if self.discount != 1:
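Since PolicyIterationModified now subclasses PolicyIteration, construction goes through PolicyIteration.__init__() (with eval_type=1) and then MDP.__init__(). A rough usage sketch under the assumption that PolicyIterationModified exposes the same iterate()/policy/V interface as the other solver classes in this diff:

    from mdp import exampleForest, PolicyIterationModified

    P, R = exampleForest()
    # Constructor chain after this commit, as read from the hunk above:
    #   PolicyIterationModified.__init__ -> PolicyIteration.__init__(..., None, max_iter, 1)
    #                                    -> MDP.__init__(..., None, max_iter)
    pim = PolicyIterationModified(P, R, 0.9, epsilon=0.01, max_iter=10)
    pim.iterate()
    print(pim.policy, pim.V)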
...
@@ -1128,7 +1140,7 @@ class QLearning(MDP):
             raise ValueError("PyMDPtoolbox: n_iter should be greater than 10000")
         
         # after this n_iter will be known as self.max_iter
-        MDP.__init__(self, transitions, reward, discount, n_iter)
+        MDP.__init__(self, transitions, reward, discount, None, n_iter)
         
         # Initialisations
         self.Q = zeros((self.S, self.A))
...
@@ -1238,13 +1250,15 @@ class RelativeValueIteration(MDP):
     def __init__(self, transitions, reward, epsilon=0.01, max_iter=1000):
         
-        MDP.__init__(self, transitions, reward, None, max_iter)
+        MDP.__init__(self, transitions, reward, None, epsilon, max_iter)
         
-        if epsilon <= 0:
-            print('MDP Toolbox ERROR: epsilon must be upper than 0')
+        self.epsilon = epsilon
+        self.discount = 1
         
-        self.U = zeros(self.S, 1)
-        self.gain = self.U[self.S]
+        self.V = matrix(zeros((self.S, 1)))
+        self.gain = 0 # self.U[self.S]
+        
+        self.average_reward = None
     
     def iterate(self):
         """"""
...
@@ -1259,29 +1273,33 @@ class RelativeValueIteration(MDP):
             self.iter = self.iter + 1;
             
-            Unext, policy = self.bellmanOperator(self.P, self.R, 1, self.U)
-            Unext = Unext - self.gain
+            self.policy, Vnext = self.bellmanOperator()
+            Vnext = Vnext - self.gain
             
-            variation = getSpan(Unext - self.U)
+            variation = getSpan(Vnext - self.V)
             
             if self.verbose:
                 print(" %s %s" % (self.iter, variation))
             
             if variation < self.epsilon:
                 done = True
-                average_reward = self.gain + min(Unext - self.U)
+                self.average_reward = self.gain + (Vnext - self.V).min()
                 if self.verbose:
                     print('MDP Toolbox : iterations stopped, epsilon-optimal policy found')
             elif self.iter == self.max_iter:
                 done = True
-                average_reward = self.gain + min(Unext - self.U);
+                self.average_reward = self.gain + (Vnext - self.V).min()
                 if self.verbose:
                     print('MDP Toolbox : iterations stopped by maximum number of iteration condition')
             
-            self.U = Unext
-            self.gain = self.U(self.S)
+            self.V = Vnext
+            self.gain = float(self.V[self.S - 1])
         
         self.time = time() - self.time
+        
+        # store value and policy as tuples
+        self.V = tuple(self.V.getA1().tolist())
+        self.policy = tuple(self.policy.getA1().tolist())
 
 class ValueIteration(MDP):
     """
...
@@ -1401,21 +1419,18 @@ class ValueIteration(MDP):
     def __init__(self, transitions, reward, discount, epsilon=0.01, max_iter=1000, initial_value=0):
         """Resolution of discounted MDP with value iteration algorithm."""
         
-        MDP.__init__(self, transitions, reward, discount, max_iter)
+        MDP.__init__(self, transitions, reward, discount, epsilon, max_iter)
         
         # initialization of optional arguments
-        if (initial_value == 0):
+        if initial_value == 0:
             self.V = matrix(zeros((self.S, 1)))
         else:
-            if (not initial_value.shape in ((self.S, ), (self.S, 1), (1, self.S))):
+            if not initial_value.shape in ((self.S, ), (self.S, 1), (1, self.S)):
                 raise ValueError("PyMDPtoolbox: The initial value must be a vector of length S")
             else:
                 self.V = matrix(initial_value)
         
-        if epsilon <= 0:
-            raise ValueError("PyMDPtoolbox: epsilon must be greater than 0")
-        if (self.discount < 1):
+        if self.discount < 1:
             # compute a bound for the number of iterations and update the
             # stored value of self.max_iter
             self.boundIter(epsilon)
...
@@ -1464,7 +1479,7 @@ class ValueIteration(MDP):
             max_iter = log((epsilon * (1 - self.discount) / self.discount) / getSpan(value - Vprev)) / log(self.discount * k)
         #self.V = Vprev
         
-        self.max_iter = ceil(max_iter)
+        self.max_iter = int(ceil(max_iter))
     
     def iterate(self):
         """
...
test_mdptoolbox.py  (view file @ 0660caca)
...
@@ -6,7 +6,8 @@ Created on Sun May 27 23:16:57 2012
 """
 
 from mdp import check, checkSquareStochastic, exampleForest, exampleRand, MDP
-from mdp import PolicyIteration, ValueIteration, ValueIterationGS
+from mdp import PolicyIteration, RelativeValueIteration, ValueIteration
+from mdp import ValueIterationGS
 
 from numpy import absolute, array, eye, matrix, zeros
 from numpy.random import rand
...
@@ -18,6 +19,13 @@ STATES = 10
 ACTIONS = 3
 SMALLNUM = 10e-12
+
+# Arrays
+P = array([[[0.5, 0.5],[0.8, 0.2]],[[0, 1],[0.1, 0.9]]])
+R = array([[5, 10], [-1, 2]])
+Pf, Rf = exampleForest()
+Pr, Rr = exampleRand(STATES, ACTIONS)
+Prs, Rrs = exampleRand(STATES, ACTIONS, is_sparse=True)
 
 # check: square, stochastic and non-negative ndarrays
 def test_check_square_stochastic_nonnegative_array_1():
...
@@ -130,7 +138,6 @@ def test_checkSquareStochastic_eye_sparse():
     assert checkSquareStochastic(P) == None
 
 # exampleForest
-Pf, Rf = exampleForest()
 
 def test_exampleForest_P_shape():
     assert (Pf == array([[[0.1, 0.9, 0.0],
...
@@ -151,8 +158,6 @@ def test_exampleForest_check():
 
 # exampleRand
-Pr, Rr = exampleRand(STATES, ACTIONS)
-
 def test_exampleRand_dense_P_shape():
     assert (Pr.shape == (ACTIONS, STATES, STATES))
...
@@ -162,8 +167,6 @@ def test_exampleRand_dense_R_shape():
 def test_exampleRand_dense_check():
     assert check(Pr, Rr) == None
 
-Prs, Rrs = exampleRand(STATES, ACTIONS, is_sparse=True)
-
 def test_exampleRand_sparse_P_shape():
     assert (Prs.shape == (ACTIONS, ))
...
@@ -173,9 +176,6 @@ def test_exampleRand_sparse_R_shape():
 def test_exampleRand_sparse_check():
     assert check(Prs, Rrs) == None
 
-P = array([[[0.5, 0.5],[0.8, 0.2]],[[0, 1],[0.1, 0.9]]])
-R = array([[5, 10], [-1, 2]])
-
 # MDP
 def test_MDP_P_R_1():
...
@@ -298,6 +298,11 @@ def test_PolicyIteration_matrix_exampleForest():
 
 # ValueIterationGS
+def test_ValueIterationGS_boundIter_exampleForest():
+    a = ValueIterationGS(Pf, Rf, 0.9)
+    itr = 39
+    assert (a.max_iter == itr)
+
 def test_ValueIterationGS_exampleForest():
     a = ValueIterationGS(Pf, Rf, 0.9)
     p = matrix('0 0 0')
...
@@ -308,6 +313,18 @@ def test_ValueIterationGS_exampleForest():
     assert a.iter == itr
     assert (absolute(array(a.V) - v) < SMALLNUM).all()
 
+# RelativeValueIteration
+def test_RelativeValueIteration_exampleForest():
+    a = RelativeValueIteration(Pf, Rf)
+    itr = 4
+    p = matrix('0 0 0')
+    v = matrix('-4.360000000000000 -0.760000000000000 3.240000000000000')
+    a.iterate()
+    assert (array(a.policy) == p).all()
+    assert a.iter == itr
+    assert (absolute(array(a.V) - v) < SMALLNUM).all()
+
 #def test_JacksCarRental():
 #    S = 21 ** 2
 #    A = 11
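These tests are plain assert-style functions, so besides a nose- or pytest-style runner they can also be called directly. A small sketch, assuming test_mdptoolbox.py and mdp.py are importable from the working directory (importing the module also builds the example P, R, Pf, Rf, ... arrays at module level):

    import test_mdptoolbox

    test_mdptoolbox.test_RelativeValueIteration_exampleForest()
    print("RelativeValueIteration test passed")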
...