pymdptoolbox · Commits

Commit 5f7f035b
authored Jan 24, 2013 by Steven Cordwell

    fixes and unittest fixes for ValueIteration and PolicyIteration classes

parent fa69c0d3

Showing 2 changed files with 139 additions and 65 deletions (+139 -65):

    mdp.py                +63 -46
    test_mdptoolbox.py    +76 -19
mdp.py
@@ -33,7 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 """

 from numpy import absolute, array, diag, matrix, mean, mod, multiply, ndarray
-from numpy import ones, zeros
+from numpy import nonzero, ones, zeros
 from numpy.random import rand
 from math import ceil, log, sqrt
 from random import randint, random

@@ -419,6 +419,13 @@ def exampleRand(S, A, is_sparse=False, mask=None):
     return (P, R)

+def getSpan(self, W):
+    """Returns the span of W
+    sp(W) = max W(s) - min W(s)
+    """
+    return (W.max() - W.min())
+
 class MDP(object):
     """The Markov Decision Problem Toolbox."""

@@ -502,27 +509,31 @@ class MDP(object):
         Ppolicy(SxS)  = transition matrix for policy
         PRpolicy(S)   = reward matrix for policy
         """
-        Ppolicy = zeros(())
-        PRpolicy = zeros(())
-        for a in range(self.A): # avoid looping over S
-            ind = find(self.policy == a) # the rows that use action a
-            if not isempty(ind):
-                if iscell(P):
-                    Ppolicy[ind,:] = self.P[a][ind,:]
-                else:
-                    Ppolicy[ind,:] = self.P[ind,:,a]
+        Ppolicy = matrix(zeros((self.S, self.S)))
+        Rpolicy = matrix(zeros((self.S, 1)))
+        for aa in range(self.A): # avoid looping over S
+            # the rows that use action a. .getA1() is used to make sure that
+            # ind is a 1 dimensional vector
+            ind = nonzero(self.policy == aa)[0].getA1()
+            if ind.size > 0: # if no rows use action a, then no point continuing
+                Ppolicy[ind, :] = self.P[aa][ind, :]
-                PR = self.computePR()
-                PRpolicy[ind, 1] = PR[ind, a]
-        # self.R cannot be sparse with the code in its current condition
+                #PR = self.computePR() # an apparently uneeded line, and
+                # perhaps harmful in this implementation c.f.
+                # mdp_computePpolicyPRpolicy.m
+                Rpolicy[ind] = self.R[ind, aa]
+        # self.R cannot be sparse with the code in its current condition, but
+        # it should be possible in the future. Also, if R is so big that its
+        # a good idea to use a sparse matrix for it, then converting PRpolicy
+        # from a dense to sparse matrix doesn't seem very memory efficient
         if type(self.R) is sparse:
-            PRpolicy = sparse(PRpolicy)
+            Rpolicy = sparse(Rpolicy)
         #self.Ppolicy = Ppolicy
-        #self.Rpolicy = PRpolicy
-        return (Ppolicy, PRpolicy)
+        #self.Rpolicy = Rpolicy
+        return (Ppolicy, Rpolicy)

     def computePR(self, P, R):
         """Computes the reward for the system in one state chosing an action

@@ -584,13 +595,6 @@ class MDP(object):
         """
         raise NotImplementedError("You should create an iterate() method.")

-    def getSpan(self, W):
-        """Returns the span of W
-        sp(W) = max W(s) - min W(s)
-        """
-        return (W.max() - W.min())
-
     def setSilent(self):
         """Ask for running resolution functions of the MDP Toolbox in silent
         mode.

@@ -804,6 +808,8 @@ class PolicyIteration(MDP):
         if eval_type in (0, "matrix"):
             from numpy.linalg import solve
             from scipy.sparse import eye
+            self.speye = eye
+            self.lin_eq = solve
             self.eval_type = "matrix"
         elif eval_type in (1, "iterative"):

@@ -844,11 +850,11 @@ class PolicyIteration(MDP):
         epsilon-optimum value function found or maximum number of iterations reached.
         """
         if V0 == 0:
-            Vpolicy = zeros(self.S, 1)
+            policy_V = zeros((self.S, 1))
         else:
             raise NotImplementedError("evalPolicyIterative: case V0 != 0 not implemented. Use V0=0 instead.")

-        Ppolicy, PRpolicy = self.computePpolicyPRpolicy(self.P, self.R, self.policy)
+        policy_P, policy_R = self.computePpolicyPRpolicy()

         if self.verbose:
             print(' Iteration V_variation')

@@ -857,21 +863,24 @@ class PolicyIteration(MDP):
         done = False
         while not done:
             itr = itr + 1
-            Vprev = Vpolicy
-            Vpolicy = PRpolicy + self.discount * Ppolicy * Vprev
-            variation = max(abs(Vpolicy - Vprev))
+            Vprev = policy_V
+            policy_V = policy_R + self.discount * policy_P * Vprev
+            variation = absolute(policy_V - Vprev).max()
             if self.verbose:
                 print(' %s %s') % (itr, variation)
             if variation < ((1 - self.discount) / self.discount) * epsilon: # to ensure |Vn - Vpolicy| < epsilon
                 done = True
                 if self.verbose:
-                    print('MDP Toolbox: iterations stopped, epsilon-optimal value function')
+                    print('PyMDPtoolbox: iterations stopped, epsilon-optimal value function')
             elif itr == max_iter:
                 done = True
                 if self.verbose:
-                    print('MDP Toolbox: iterations stopped by maximum number of iteration condition')
+                    print('PyMDPtoolbox: iterations stopped by maximum number of iteration condition')
-        self.value = Vpolicy
+        self.value = policy_V

     def evalPolicyMatrix(self):
         """Evaluation of the value function of a policy

@@ -894,12 +903,12 @@ class PolicyIteration(MDP):
         Vpolicy(S) = value function of the policy
         """
-        Ppolicy, PRpolicy = self.computePpolicyPRpolicy(self.P, self.R, self.policy)
+        Ppolicy, Rpolicy = self.computePpolicyPRpolicy()
         # V = PR + gPV => (I-gP)V = PR => V = inv(I-gP)* PR
-        self.value = self.lin_eq((speye(self.S, self.S) - self.discount * Ppolicy), PRpolicy)
+        self.value = self.lin_eq((self.speye(self.S, self.S) - self.discount * Ppolicy), Rpolicy)

     def iterate(self):
-        """"""
+        """
+        Run the policy iteration algorithm.
+        """
         if self.verbose:
             print(' Iteration Number_of_different_actions')

@@ -927,8 +936,16 @@ class PolicyIteration(MDP):
             if self.verbose:
                 print(' %s %s') % (self.iter, n_different)
-            if (policy_next == self.policy).all() or (self.iter == self.max_iter):
+            if n_different == 0:
                 done = True
+                if self.verbose:
+                    print("...iterations stopped, unchanging policy found")
+            elif (self.iter == self.max_iter):
+                done = True
+                if self.verbose:
+                    print("...iterations stopped by maximum number of iteration condition")
             else:
                 self.policy = policy_next
         self.time = time() - self.time

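
The surrounding method alternates policy evaluation and a greedy (Bellman) improvement step, stopping when the policy stops changing or the iteration cap is hit. A tiny self-contained policy-iteration loop on the 2-state, 2-action example used by the tests in this commit; it mirrors the control flow only and is not toolbox code:

    from numpy import argmax, array, eye
    from numpy.linalg import solve

    P = array([[[0.5, 0.5], [0.8, 0.2]],     # P[a, s, s'], action 0
               [[0.0, 1.0], [0.1, 0.9]]])    # action 1
    R = array([[5.0, 10.0], [-1.0, 2.0]])    # R[s, a]
    gamma, S, A = 0.9, 2, 2

    policy = array([0, 0])
    while True:
        # policy evaluation: solve (I - gamma * P_pi) V = R_pi
        P_pi = array([P[policy[s], s, :] for s in range(S)])
        R_pi = array([R[s, policy[s]] for s in range(S)])
        V = solve(eye(S) - gamma * P_pi, R_pi)
        # policy improvement: greedy Bellman step
        Q = array([[R[s, a] + gamma * P[a, s, :].dot(V) for a in range(A)]
                   for s in range(S)])
        policy_next = argmax(Q, axis=1)
        if (policy_next == policy).all():    # unchanging policy => stop
            break
        policy = policy_next

    # for this example the loop ends with policy [1, 0] and V close to
    # [42.44, 36.05], the figures in the commented-out assertions of the old
    # test_PolicyIteration removed further down in this commit
    print(policy, V)
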
@@ -1009,7 +1026,7 @@ class PolicyIterationModified(MDP):
             Vnext, policy = self.bellmanOperator(self.P, self.PR, self.discount, self.V)
             #[Ppolicy, PRpolicy] = mdp_computePpolicyPRpolicy(P, PR, policy);
-            variation = self.getSpan(Vnext - self.value);
+            variation = getSpan(Vnext - self.value);
             if self.verbose:
                 print(" %s %s" % (self.iter, variation))

@@ -1237,7 +1254,7 @@ class RelativeValueIteration(MDP):
             Unext, policy = self.bellmanOperator(self.P, self.R, 1, self.U)
             Unext = Unext - self.gain
-            variation = self.getSpan(Unext - self.U)
+            variation = getSpan(Unext - self.U)
             if self.verbose:
                 print(" %s %s" % (self.iter, variation))

@@ -1382,8 +1399,8 @@ class ValueIteration(MDP):
         if (initial_value == 0):
             self.value = matrix(zeros((self.S, 1)))
         else:
-            if (initial_value.size != self.S):
-                raise ValueError("The initial value must be length S")
+            if (not initial_value.shape in ((self.S, ), (self.S, 1), (1, self.S))):
+                raise ValueError("The initial value must be a vector of length S")
             else:
                 self.value = matrix(initial_value)

@@ -1431,10 +1448,10 @@ class ValueIteration(MDP):
         k = 1 - h.sum()
         Vprev = self.value
-        self.bellmanOperator()
+        null, value = self.bellmanOperator()
         # p 201, Proposition 6.6.5
-        max_iter = log( (epsilon * (1 - self.discount) / self.discount) / self.getSpan(self.value - Vprev) ) / log(self.discount * k)
-        self.value = Vprev
+        max_iter = log( (epsilon * (1 - self.discount) / self.discount) / getSpan(value - Vprev) ) / log(self.discount * k)
+        #self.value = Vprev
         self.max_iter = ceil(max_iter)

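
The bound computed above is the one cited in the code (Puterman, p. 201, Proposition 6.6.5): after a single Bellman backup, at most log( epsilon * (1 - gamma) / gamma / sp(V1 - V0) ) / log(gamma * k) further iterations are needed for an epsilon-optimal value function. A small numeric sketch of that arithmetic, with illustrative values for k and the span:

    from math import ceil, log

    gamma = 0.9          # discount
    epsilon = 0.01       # required accuracy
    k = 0.5              # 1 - sum of per-column minima h (illustrative)
    span_V1_V0 = 3.0     # sp(V1 - V0) after one Bellman backup (illustrative)

    max_iter = log((epsilon * (1 - gamma) / gamma) / span_V1_V0) / log(gamma * k)
    print(ceil(max_iter))    # 10 for these numbers
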
@@ -1458,7 +1475,7 @@ class ValueIteration(MDP):
             # The values, based on Q. For the function "max()": the option
             # "axis" means the axis along which to operate. In this case it
             # finds the maximum of the the rows. (Operates along the columns?)
-            variation = self.getSpan(self.value - Vprev)
+            variation = getSpan(self.value - Vprev)
             if self.verbose:
                 print(" %s %s" % (self.iter, variation))

@@ -1566,7 +1583,7 @@ class ValueIterationGS(ValueIteration):
                 Q[a] = self.R[s, a] + self.discount * self.P[a][s,:] * self.value
             self.value[s] = max(Q)
-            variation = self.getSpan(V - Vprev)
+            variation = getSpan(V - Vprev)
             if self.verbose:
                 print(" %s %s" % (self.iter, variation))

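
ValueIterationGS applies the backup state by state and overwrites self.value in place, so later states within the same sweep already see the freshly updated values (Gauss-Seidel ordering). A self-contained sketch of one such sweep loop with the span-based stopping test; the data and threshold are illustrative:

    from numpy import array, zeros

    P = array([[[0.5, 0.5], [0.8, 0.2]],
               [[0.0, 1.0], [0.1, 0.9]]])    # P[a, s, s']
    R = array([[5.0, 10.0], [-1.0, 2.0]])    # R[s, a]
    gamma, S, A = 0.9, 2, 2

    value = zeros(S)
    for sweep in range(1000):
        Vprev = value.copy()
        for s in range(S):
            # later states in this sweep already use the updated value[s]
            Q = [R[s, a] + gamma * P[a, s, :].dot(value) for a in range(A)]
            value[s] = max(Q)
        diff = value - Vprev
        variation = diff.max() - diff.min()  # getSpan(value - Vprev)
        if variation < ((1 - gamma) / gamma) * 0.01:
            break
    print(value)
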
test_mdptoolbox.py
@@ -172,10 +172,10 @@ R = array([[5, 10], [-1, 2]])
 # MDP

 def test_MDP_P_R_1():
-    P1 = zeros((ACTIONS, ), dtype=object)
-    P1[0] = matrix([[0.5, 0.5],[0.8, 0.2]])
-    P1[1] = matrix([[0, 1],[0.1, 0.9]])
-    R1 = matrix([[5, 10], [-1, 2]])
+    P1 = zeros((2, ), dtype=object)
+    P1[0] = matrix('0.5 0.5; 0.8 0.2')
+    P1[1] = matrix('0 1; 0.1 0.9')
+    R1 = matrix('5 10; -1 2')
     a = MDP(P, R, 0.9, 0.01)
     assert a.P.dtype == P1.dtype
     assert a.R.dtype == R1.dtype

@@ -185,10 +185,10 @@ def test_MDP_P_R_1():
 def test_MDP_P_R_2():
     R = array([[[5, 10], [-1, 2]], [[1, 2], [3, 4]]])
-    P1 = zeros((ACTIONS, ), dtype=object)
-    P1[0] = matrix([[0.5, 0.5],[0.8, 0.2]])
-    P1[1] = matrix([[0, 1],[0.1, 0.9]])
-    R1 = matrix([[7.5, 2], [-0.4, 3.9]])
+    P1 = zeros((2, ), dtype=object)
+    P1[0] = matrix('0.5 0.5; 0.8 0.2')
+    P1[1] = matrix('0 1; 0.1 0.9')
+    R1 = matrix('7.5 2; -0.4 3.9')
     a = MDP(P, R, 0.9, 0.01)
     assert type(a.P) == type(P1)
     assert type(a.R) == type(R1)

@@ -201,7 +201,7 @@ def test_MDP_P_R_2():
 def test_MDP_P_R_3():
     P = array([[[0.6116, 0.3884],[0, 1]],[[0.6674, 0.3326],[0, 1]]])
     R = array([[[-0.2433, 0.7073],[0, 0.1871]],[[-0.0069, 0.6433],[0, 0.2898]]])
-    PR = matrix([[0.12591304, 0.20935652], [0.1871, 0.2898]])
+    PR = matrix('0.12591304 0.20935652; 0.1871 0.2898')
     a = MDP(P, R, 0.9, 0.01)
     assert (absolute(a.R - PR) < SMALLNUM).all()

@@ -229,16 +229,73 @@ def test_ValueIteration_exampleForest():
 def test_PolicyIteration_init_policy0():
     a = PolicyIteration(P, R, 0.9)
-    p = array((1, 1)).reshape(2, 1)
-    assert (absolute(a.policy - p) < SMALLNUM).all()
-
-def test_PolicyIteration():
-    PolicyIteration(P, R, 0.9)
-    #inst.iterate()
-    #assert (abs(inst.value[0] - 42.4419) < 0.001)
-    #assert (abs(inst.value[1] - 36.0465) < 0.001)
-    #assert (inst.policy == (1, 0))
-    #assert (inst.iter == 2)
+    p = matrix('1; 1')
+    assert (a.policy == p).all()
+
+def test_PolicyIteration_init_policy0_exampleForest():
+    P, R = exampleForest()
+    a = PolicyIteration(P, R, 0.9)
+    p = matrix('0; 1; 0')
+    assert (a.policy == p).all()
+
+def test_PolicyIteration_computePpolicyPRpolicy_exampleForest():
+    P, R = exampleForest()
+    a = PolicyIteration(P, R, 0.9)
+    P1 = matrix('0.1 0.9 0; 1 0 0; 0.1 0 0.9')
+    R1 = matrix('0; 1; 4')
+    Ppolicy, Rpolicy = a.computePpolicyPRpolicy()
+    assert (absolute(Ppolicy - P1) < SMALLNUM).all()
+    assert (absolute(Rpolicy - R1) < SMALLNUM).all()
+
+def test_PolicyIteration_evalPolicyIterative_exampleForest():
+    P, R = exampleForest()
+    v0 = matrix('0; 0; 0')
+    v1 = matrix('4.47504640074458; 5.02753258879703; 23.17234211944304')
+    p = matrix('0; 1; 0')
+    a = PolicyIteration(P, R, 0.9)
+    assert (absolute(a.value - v0) < SMALLNUM).all()
+    a.evalPolicyIterative()
+    assert (absolute(a.value - v1) < SMALLNUM).all()
+    assert (a.policy == p).all()
+
+def test_PolicyIteration_evalPolicyIterative_bellmanOperator_exampleForest():
+    P, R = exampleForest()
+    v = matrix('4.47504640074458; 5.02753258879703; 23.17234211944304')
+    p = matrix('0; 0; 0')
+    a = PolicyIteration(P, R, 0.9)
+    a.evalPolicyIterative()
+    policy, value = a.bellmanOperator()
+    assert (policy == p).all()
+    assert (absolute(a.value - v) < SMALLNUM).all()
+
+def test_PolicyIteration_iterative_exampleForest():
+    P, R = exampleForest()
+    a = PolicyIteration(P, R, 0.9, eval_type=1)
+    V = matrix('26.2439058351861 29.4839058351861 33.4839058351861')
+    p = matrix('0 0 0')
+    itr = 2
+    a.iterate()
+    assert (absolute(array(a.value) - V) < SMALLNUM).all()
+    assert (array(a.policy) == p).all()
+    assert a.iter == itr
+
+def test_PolicyIteration_evalPolicyMatrix_exampleForest():
+    P, R = exampleForest()
+    v_pol = matrix('4.47513812154696; 5.02762430939227; 23.17243384704857')
+    a = PolicyIteration(P, R, 0.9)
+    a.evalPolicyMatrix()
+    assert (absolute(a.value - v_pol) < SMALLNUM).all()
+
+def test_PolicyIteration_matrix_exampleForest():
+    P, R = exampleForest()
+    a = PolicyIteration(P, R, 0.9)
+    V = matrix('26.2440000000000 29.4840000000000 33.4840000000000')
+    p = matrix('0 0 0')
+    itr = 2
+    a.iterate()
+    assert (absolute(array(a.value) - V) < SMALLNUM).all()
+    assert (array(a.policy) == p).all()
+    assert a.iter == itr

 #def test_JacksCarRental():
 #    S = 21 ** 2

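
Taken together, the new tests exercise PolicyIteration end to end on the small forest-management example. A hedged usage sketch of that flow; the module path and import names are assumed to match the files above:

    # assumes mdp.py is importable on the path
    from mdp import exampleForest, PolicyIteration

    P, R = exampleForest()               # 3-state forest management example
    pi = PolicyIteration(P, R, 0.9)      # matrix evaluation by default
    pi.iterate()
    print(pi.policy)                     # the tests expect 0 0 0
    print(pi.value)                      # approx. 26.244 29.484 33.484
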