Zahra Rajabi / pymdptoolbox / Commits

Commit 985aaac9
Authored Feb 10, 2013 by Steven Cordwell

progress so far

Parent: e5fd8d1e
Changes: 2 files
TODO (view file @ 985aaac9)

...
@@ -16,4 +16,4 @@
 7. Try to use rows for the Bellman computations rather than columns (perhaps
    this storage is more Pythonic? or more NumPyic?)
-8.
+8. Create the mask as a sparse matrix in exampleRand
mdp.py (view file @ 985aaac9)

...
@@ -542,27 +542,28 @@ def exampleRand(S, A, is_sparse=False, mask=None):
     # generate the transition and reward matrices based on S, A and mask
     if is_sparse:
         # definition of transition matrix : square stochastic matrix
-        P = empty(A, dtype=object)
+        P = []
         # definition of reward matrix (values between -1 and +1)
-        R = empty(A, dtype=object)
-        for a in range(A):
+        R = []
+        for a in xrange(A):
             if mask.ndim == 3:
                 PP = mask[a, :, :] * rand(S, S)
                 for s in range(S):
                     if mask[a, s, :].sum() == 0:
                         PP[s, randint(0, S - 1)] = 1
                     PP[s, :] = PP[s, :] / PP[s, :].sum()
-                P[a] = sparse(PP)
-                R[a] = sparse(mask[a, :, :] * (2 * rand(S, S) - ones((S, S))))
+                P.append(sparse(PP))
+                R.append(sparse(mask[a] * (2 * rand(S, S) - ones((S, S)))))
             else:
                 PP = mask * rand(S, S)
                 for s in range(S):
                     if mask[s, :].sum() == 0:
                         PP[s, randint(0, S - 1)] = 1
                     PP[s, :] = PP[s, :] / PP[s, :].sum()
-                P[a] = sparse(PP)
-                R[a] = sparse(mask * (2 * rand(S, S) - ones((S, S))))
+                P.append(sparse(PP))
+                R.append(sparse(mask * (2 * rand(S, S) - ones((S, S)))))
+        P = tuple(P)
+        R = tuple(R)
     else:
         # definition of transition matrix : square stochastic matrix
         P = zeros((A, S, S))
...
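The sparse branch above now collects P and R in plain lists and converts them to tuples. As an illustration only (not code from this commit), here is a minimal standalone sketch of the same per-action row-normalisation, assuming NumPy and SciPy's csr_matrix in place of the module's sparse alias:

    import numpy as np
    from scipy.sparse import csr_matrix as sparse  # stand-in for the sparse alias in mdp.py

    def random_stochastic(S, mask=None):
        # Build one S x S row-stochastic transition matrix, as the sparse
        # branch of exampleRand does for each action.
        if mask is None:
            mask = np.ones((S, S))
        PP = mask * np.random.rand(S, S)
        for s in range(S):
            if mask[s, :].sum() == 0:
                # make sure every row has at least one successor state
                PP[s, np.random.randint(0, S)] = 1
        PP = PP / PP.sum(axis=1, keepdims=True)  # normalise each row to sum to 1
        return sparse(PP)

    # e.g. a 3-action, 10-state random transition model stored as a tuple
    P = tuple(random_stochastic(10) for _ in range(3))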
@@ -694,7 +695,9 @@ class MDP(object):
         # set the initial iteration count to zero
         self.iter = 0
+        # V should be stored as a vector ie shape of (S,) or (1, S)
         self.V = None
+        # policy can also be stored as a vector
         self.policy = None
 
     def _bellmanOperator(self, V=None):
...
@@ -712,14 +715,14 @@ class MDP(object):
             V = self.V
         else:
             try:
-                if V.shape != (self.S, 1):
+                if V.shape not in ((self.S,), (1, self.S)):
                     raise ValueError("bellman: V is not the right shape.")
             except AttributeError:
                 raise TypeError("bellman: V must be a numpy array or matrix.")
         Q = empty((self.S, self.A))
         for aa in range(self.A):
-            Q[:, aa] = self.R[aa] + (self.discount * self.P[aa].dot(V))
+            Q[:, aa] = self.R[aa] + self.discount * self.P[aa].dot(V)
         # Which way is better?
         # 1. Return, (policy, value)
...
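The relaxed shape check and simplified Q update above amount to a standard Bellman backup over a 1-D value vector. A self-contained sketch of that computation (an illustrative helper, not the class method; it assumes P is a tuple of (S, S) arrays and R a tuple of (S,) reward vectors):

    import numpy as np

    def bellman_operator(P, R, V, discount):
        # One Bellman backup: Q[s, a] = R[a][s] + discount * sum_s' P[a][s, s'] * V[s'].
        # Returns the greedy policy and its value, both of shape (S,).
        S, A = P[0].shape[0], len(P)
        Q = np.empty((S, A))
        for aa in range(A):
            Q[:, aa] = R[aa] + discount * P[aa].dot(V)
        return Q.argmax(axis=1), Q.max(axis=1)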
@@ -766,19 +769,24 @@ class MDP(object):
             for aa in xrange(self.A):
                 self.P.append(P[aa])
             self.P = tuple(self.P)
-        # Set self.R as a tuple of length A, with each element storing an S×1
-        # vector array.
+        # Set self.R as a tuple of length A, with each element storing an 1×S
+        # vector.
         try:
             if R.ndim == 2:
                 self.R = []
                 for aa in xrange(self.A):
-                    self.R.append(R[:, aa].reshape(self.S, 1))
+                    self.R.append(array(R[:, aa]).reshape(self.S))
             else:
                 raise AttributeError
         except AttributeError:
             self.R = []
             for aa in xrange(self.A):
-                self.R.append(multiply(P[aa], R[aa]).sum(1).reshape(self.S, 1))
+                try:
+                    self.R.append(P[aa].multiply(R[aa]).sum(1).reshape(self.S))
+                except AttributeError:
+                    self.R.append(multiply(P[aa], R[aa]).sum(1).reshape(self.S))
+                except:
+                    raise
         except:
             raise
         self.R = tuple(self.R)
...
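With this change _computePR stores self.R as a tuple of length A of 1-D vectors, whether R arrives as an S x A array or as per-action S x S matrices. A small sketch of both conversions under that assumption (hypothetical variable names, NumPy only):

    import numpy as np

    S, A = 4, 2
    P = tuple(np.full((S, S), 1.0 / S) for _ in range(A))  # dummy uniform transitions

    # Case 1: R given as an S x A array -> keep one (S,) column per action.
    R_sa = np.arange(S * A, dtype=float).reshape(S, A)
    R1 = tuple(np.asarray(R_sa[:, aa]).reshape(S) for aa in range(A))

    # Case 2: R given per action as S x S matrices -> expected immediate reward,
    # R[a][s] = sum_s' P[a][s, s'] * R_ss[a][s, s'].
    R_ss = [np.random.rand(S, S) for _ in range(A)]
    R2 = tuple(np.multiply(P[aa], R_ss[aa]).sum(1).reshape(S) for aa in range(A))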
@@ -854,17 +862,18 @@ class FiniteHorizon(MDP):
             raise ValueError('PyMDPtoolbox: N must be greater than 0')
         else:
             self.N = N
         # Initialise the base class
         MDP.__init__(self, transitions, reward, discount, None, None)
-        # remove the iteration counter
+        # remove the iteration counter, it is not meaningful for backwards
+        # induction
         del self.iter
         # There are value vectors for each time step up to the horizon
         self.V = zeros((self.S, N + 1))
-        self.policy = zeros((self.S, N), dtype=int)
-        if not h is None:
+        # There are policy vectors for each time step before the horizon, when
+        # we reach the horizon we don't need to make decisions anymore.
+        self.policy = empty((self.S, N), dtype=int)
+        # Set the reward for the final transition to h, if specified.
+        if h is not None:
             self.V[:, N] = h
 
     def iterate(self):
...
@@ -872,15 +881,24 @@ class FiniteHorizon(MDP):
         self.time = time()
         for n in range(self.N):
-            W, X = self._bellmanOperator(matrix(self.V[:, self.N - n]).reshape(self.S, 1))
-            self.V[:, self.N - n - 1] = X.A1
-            self.policy[:, self.N - n - 1] = W.A1
+            W, X = self._bellmanOperator(self.V[:, self.N - n])
+            self.V[:, self.N - n - 1] = X
+            self.policy[:, self.N - n - 1] = W
             if self.verbose:
                 print("stage: %s ... policy transpose : %s") % (self.N - n,
                     self.policy[:, self.N - n - 1].tolist())
         self.time = time() - self.time
+        # After this we could create a tuple of tuples for the values and
+        # policies.
+        #V = []
+        #p = []
+        #for n in xrange(self.N):
+        #    V.append()
+        #    p.append()
+        #V.append()
+        #self.V = tuple(V)
+        #self.policy = tuple(p)
 
 class LP(MDP):
...
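The loop above is backward induction: starting from the value at the horizon, each pass fills the previous stage's value and policy columns. A standalone sketch of the same recursion with 1-D value columns (an illustrative helper, not the toolbox API):

    import numpy as np

    def finite_horizon(P, R, discount, N, h=None):
        # Backward induction: V[:, n] is the optimal value with N - n stages to go.
        S, A = P[0].shape[0], len(P)
        V = np.zeros((S, N + 1))
        policy = np.empty((S, N), dtype=int)
        if h is not None:
            V[:, N] = h                       # terminal reward, if supplied
        for n in range(N):
            stage = N - n - 1                 # fill stages from the horizon backwards
            Q = np.empty((S, A))
            for aa in range(A):
                Q[:, aa] = R[aa] + discount * P[aa].dot(V[:, stage + 1])
            V[:, stage] = Q.max(axis=1)
            policy[:, stage] = Q.argmax(axis=1)
        return V, policy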
@@ -957,7 +975,7 @@ class LP(MDP):
         # (Octave uses glpk) and perhaps Matlab. If solver=None (ie using the
         # default cvxopt solver) then V agrees with the Octave equivalent
         # only to 10e-8 places.
-        self.V = matrix(self._linprog(f, M, -h, solver='glpk')['x'])
+        self.V = array(self._linprog(f, M, -h, solver='glpk')['x'])
         self.policy, self.V = self._bellmanOperator()
...
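For reference, the problem this class hands to _linprog is (as far as the visible code suggests) the standard MDP linear program: minimise the sum of V subject to V >= R[a] + discount * P[a] V for every action. A sketch using scipy.optimize.linprog purely as an illustration, not the cvxopt-based call the class actually makes:

    import numpy as np
    from scipy.optimize import linprog  # illustration only; the class uses a cvxopt-based _linprog

    def lp_value(P, R, discount):
        # min sum(V)  s.t.  (discount * P[a] - I) V <= -R[a]  for every action a,
        # i.e. V >= R[a] + discount * P[a] V.
        S, A = P[0].shape[0], len(P)
        c = np.ones(S)
        A_ub = np.vstack([discount * P[aa] - np.eye(S) for aa in range(A)])
        b_ub = np.concatenate([-np.asarray(R[aa]) for aa in range(A)])
        res = linprog(c, A_ub=A_ub, b_ub=b_ub, bounds=[(None, None)] * S)
        return res.x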
@@ -1019,7 +1037,7 @@ class PolicyIteration(MDP):
         if policy0 == None:
             # initialise the policy to the one which maximises the expected
             # immediate reward
-            self.V = matrix(zeros((self.S, 1)))
+            self.V = zeros(self.S)
             self.policy, null = self._bellmanOperator()
             del null
         else:
...
@@ -1029,7 +1047,7 @@ class PolicyIteration(MDP):
                 raise ValueError("PyMDPtolbox: policy0 must a vector with "
                                  "length S.")
-            policy0 = matrix(policy0.reshape(self.S, 1))
+            policy0 = policy0.reshape(self.S)
             if (mod(policy0, 1).any() or (policy0 < 0).any() or
                     (policy0 >= self.S).any()):
...
@@ -1039,7 +1057,7 @@ class PolicyIteration(MDP):
             self.policy = policy0
         # set or reset the initial values to zero
-        self.V = matrix(zeros((self.S, 1)))
+        self.V = zeros(self.S)
         if eval_type in (0, "matrix"):
             from numpy.linalg import solve
...
@@ -1076,29 +1094,24 @@ class PolicyIteration(MDP):
             PRpolicy(S) = reward matrix for policy
         """
-        Ppolicy = matrix(zeros((self.S, self.S)))
-        Rpolicy = matrix(zeros((self.S, 1)))
+        Ppolicy = empty((self.S, self.S))
+        Rpolicy = zeros(self.S)
         for aa in range(self.A):  # avoid looping over S
-            # the rows that use action a. .getA1() is used to make sure that
-            # ind is a 1 dimensional vector
-            ind = (self.policy == aa).nonzero()[0].getA1()
+            # the rows that use action a.
+            ind = (self.policy == aa).nonzero()[0]
             # if no rows use action a, then no need to assign this
             if ind.size > 0:
                 Ppolicy[ind, :] = self.P[aa][ind, :]
                 #PR = self._computePR() # an apparently uneeded line, and
                 # perhaps harmful in this implementation c.f.
                 # mdp_computePpolicyPRpolicy.m
                 Rpolicy[ind] = self.R[aa][ind]
         # self.R cannot be sparse with the code in its current condition, but
         # it should be possible in the future. Also, if R is so big that its
         # a good idea to use a sparse matrix for it, then converting PRpolicy
         # from a dense to sparse matrix doesn't seem very memory efficient
         if type(self.R) is sparse:
             Rpolicy = sparse(Rpolicy)
         #self.Ppolicy = Ppolicy
         #self.Rpolicy = Rpolicy
         return (Ppolicy, Rpolicy)
...
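With Ppolicy an (S, S) array and Rpolicy an (S,) vector, matrix policy evaluation reduces to one linear solve, which appears to be what the numpy.linalg.solve import under the "matrix" eval_type is for. An illustrative sketch of that pipeline (hypothetical helper, not the class method):

    import numpy as np

    def eval_policy_matrix(P, R, policy, discount):
        # Build P_pi and R_pi as _computePpolicyPRpolicy does, then evaluate the
        # policy with one linear solve: (I - discount * P_pi) V = R_pi.
        S = P[0].shape[0]
        Ppolicy = np.empty((S, S))
        Rpolicy = np.zeros(S)
        for aa in range(len(P)):
            ind = (policy == aa).nonzero()[0]   # states that take action aa under the policy
            if ind.size > 0:
                Ppolicy[ind, :] = P[aa][ind, :]
                Rpolicy[ind] = R[aa][ind]
        V = np.linalg.solve(np.eye(S) - discount * Ppolicy, Rpolicy)
        return Ppolicy, Rpolicy, V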
@@ -1136,7 +1149,7 @@ class PolicyIteration(MDP):
         """
         if (type(V0) in (int, float)) and (V0 == 0):
-            policy_V = zeros((self.S, 1))
+            policy_V = zeros(self.S)
         else:
             if (type(V0) in (ndarray, matrix)) and (V0.shape == (self.S, 1)):
                 policy_V = V0
...
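The V0 handling above seeds what appears to be the iterative policy evaluator. With the commit's 1-D shape convention, iterative evaluation is just repeated backups under the fixed policy until V stops changing; a minimal sketch under that assumption (hypothetical helper and parameter names):

    import numpy as np

    def eval_policy_iterative(Ppolicy, Rpolicy, discount, V0=0, epsilon=1e-4, max_iter=1000):
        # Repeated backups under a fixed policy, with V kept as a 1-D vector of shape (S,).
        S = Ppolicy.shape[0]
        if np.isscalar(V0) and V0 == 0:
            V = np.zeros(S)
        else:
            V = np.asarray(V0).reshape(S)
        for _ in range(max_iter):
            V_prev = V
            V = Rpolicy + discount * Ppolicy.dot(V_prev)   # one backup under the fixed policy
            if np.abs(V - V_prev).max() < epsilon:
                break
        return V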