Zahra Rajabi / pymdptoolbox / Commits / 5c5ba863

Commit 5c5ba863 authored Jan 26, 2013 by Steven Cordwell

    bellmanOperator and computePR converted to leading underscore internal use name

parent 29fda10c

Changes: 1
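The rename follows the usual Python convention (PEP 8) that a single leading underscore marks a name as internal: bellmanOperator and computePR become _bellmanOperator and _computePR, signalling that they are implementation details of MDP and its subclasses rather than part of the public interface. A minimal sketch of the pattern, with hypothetical names chosen purely for illustration (not the toolbox's actual code):

class Solver(object):
    """Illustrates the leading-underscore convention adopted in this commit."""

    def __init__(self, rewards):
        self.V = list(rewards)       # public result attribute
        self.policy = None           # public result attribute

    def _backup(self):
        # Leading underscore: internal helper. Python does not enforce
        # privacy; the underscore is purely a signal to callers.
        self.policy = max(range(len(self.V)), key=self.V.__getitem__)

    def run(self):
        # Public entry point; outside code calls run(), not _backup().
        self._backup()
        return self.policy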
mdp.py
@@ -465,7 +465,7 @@ class MDP(object):
         check(transitions, reward)
         # computePR will assign the variables self.S, self.A, self.P and self.R
-        self.computePR(transitions, reward)
+        self._computePR(transitions, reward)
         # the verbosity is by default turned off
         self.verbose = False
@@ -477,7 +477,7 @@ class MDP(object):
         self.V = None
         self.policy = None
 
-    def bellmanOperator(self, V=None):
+    def _bellmanOperator(self, V=None):
         """Apply the Bellman operator on the value function.
 
         Updates the value function and the Vprev-improving policy.
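For reference, the operator this method applies evaluates, for every state s and action a, Q(s, a) = R(s, a) + discount * sum over s' of P(s' | s, a) * V(s'), then returns the greedy policy argmax_a Q(s, a) together with the improved values max_a Q(s, a); the commented-out lines visible in the next hunk, Q.max(axis=1) and Q.argmax(axis=1), hint at exactly this. A standalone NumPy sketch of the backup, not the toolbox's exact implementation (which works on the instance attributes self.P, self.R and self.discount):

import numpy as np

def bellman_backup(P, R, V, discount):
    """One Bellman backup (sketch).

    P : (A, S, S) array, P[a, s, s2] = probability of moving s -> s2 under a
    R : (S, A) array of expected immediate rewards
    V : (S,) array, current value function
    Returns (policy, values): the greedy policy and the improved values.
    """
    A, S, _ = P.shape
    Q = np.empty((S, A))
    for a in range(A):
        # Q(s, a) = R(s, a) + discount * sum_s2 P(s2 | s, a) * V(s2)
        Q[:, a] = R[:, a] + discount * P[a].dot(V)
    return Q.argmax(axis=1), Q.max(axis=1)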
@@ -507,7 +507,7 @@ class MDP(object):
         # self.V = Q.max(axis=1)
         # self.policy = Q.argmax(axis=1)
 
-    def computePR(self, P, R):
+    def _computePR(self, P, R):
         """Compute the reward for the system in one state chosing an action.
 
         Arguments
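computePR (now _computePR) normalises the transition and reward arguments into the per-state, per-action arrays the solvers work with. In particular, when the reward is supplied per transition, the expected immediate reward of action a in state s is the probability-weighted average over successor states. A hedged sketch of that reduction (illustration only; the real method also validates shapes and handles sparse inputs):

import numpy as np

def expected_reward(P, R_sas):
    """Reduce a per-transition reward to an expected per-(state, action) reward.

    P     : (A, S, S) array, P[a, s, s2] = probability of s -> s2 under action a
    R_sas : (A, S, S) array, reward received on that transition
    Returns an (S, A) array: R(s, a) = sum_s2 P(a, s, s2) * R_sas(a, s, s2).
    """
    return (P * R_sas).sum(axis=2).T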
@@ -634,7 +634,7 @@ class FiniteHorizon(MDP):
         self.time = time()
 
         for n in range(self.N):
-            W, X = self.bellmanOperator(
+            W, X = self._bellmanOperator(
                 matrix(self.V[:, self.N - n]).reshape(self.S, 1))
             self.V[:, self.N - n - 1] = X.A1
             self.policy[:, self.N - n - 1] = W.A1
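This is the backward-induction loop of finite-horizon dynamic programming: starting from the terminal values apparently stored in the last column of self.V, each pass applies one Bellman backup to fill in the value function and policy of the preceding stage, V_t(s) = max_a [ R(s, a) + discount * sum_s' P(s' | s, a) * V_{t+1}(s') ]. A hedged sketch of the same recursion on plain NumPy arrays (not the toolbox's matrix-based code):

import numpy as np

def finite_horizon(P, R, discount, N, terminal_values):
    """Backward induction over N stages (sketch).

    P : (A, S, S), R : (S, A). Returns V of shape (S, N + 1) and policy (S, N).
    """
    A, S, _ = P.shape
    V = np.zeros((S, N + 1))
    policy = np.zeros((S, N), dtype=int)
    V[:, N] = terminal_values
    for t in range(N - 1, -1, -1):
        # Q(s, a) = R(s, a) + discount * sum_s' P(s'|s, a) * V_{t+1}(s')
        Q = R + discount * np.stack([P[a].dot(V[:, t + 1]) for a in range(A)], axis=1)
        policy[:, t] = Q.argmax(axis=1)
        V[:, t] = Q.max(axis=1)
    return V, policy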
@@ -726,7 +726,7 @@ class LP(MDP):
         # only to 10e-8 places.
         self.V = matrix(self.linprog(self.f, self.M, -h, solver='glpk')['x'])
 
-        self.policy, self.V = self.bellmanOperator()
+        self.policy, self.V = self._bellmanOperator()
 
         self.time = time() - self.time
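The LP class obtains the optimal value function by linear programming (the solver='glpk' keyword and the ['x'] lookup suggest a cvxopt-style lp call) and then recovers the greedy policy with a single Bellman backup. The standard primal LP for a discounted MDP minimises sum_s V(s) subject to V(s) >= R(s, a) + discount * sum_s' P(s' | s, a) * V(s') for every state-action pair; the toolbox feeds an equivalent constraint system to its solver. A hedged sketch of that formulation using scipy.optimize.linprog, purely for illustration (the toolbox does not use SciPy here):

import numpy as np
from scipy.optimize import linprog

def lp_values(P, R, discount):
    """Solve a discounted MDP by linear programming (sketch).

    P : (A, S, S), R : (S, A). Returns the optimal value function, shape (S,).
    """
    A, S, _ = P.shape
    c = np.ones(S)                      # objective: minimise the sum of values
    # One (S x S) block per action: -(I - discount * P_a) V <= -R(:, a)
    A_ub = np.vstack([-(np.eye(S) - discount * P[a]) for a in range(A)])
    b_ub = np.concatenate([-R[:, a] for a in range(A)])
    res = linprog(c, A_ub=A_ub, b_ub=b_ub, bounds=[(None, None)] * S)
    return res.x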
@@ -787,7 +787,7 @@ class PolicyIteration(MDP):
             # initialise the policy to the one which maximises the expected
             # immediate reward
             self.V = matrix(zeros((self.S, 1)))
-            self.policy, null = self.bellmanOperator()
+            self.policy, null = self._bellmanOperator()
             del null
         else:
             policy0 = array(policy0)
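Zeroing self.V before the backup is what makes this an initialisation to the immediate-reward-maximising policy: with V = 0 the backup collapses to Q(s, a) = R(s, a), so the returned policy is the row-wise argmax of the expected immediate rewards. A tiny illustration:

import numpy as np

# With V = 0 the backup Q(s, a) = R(s, a) + discount * P_a V reduces to Q = R.
R = np.array([[1.0, 0.5],
              [0.2, 0.9]])          # (S, A) expected immediate rewards
initial_policy = R.argmax(axis=1)   # -> array([0, 1])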
@@ -854,7 +854,7 @@ class PolicyIteration(MDP):
             if ind.size > 0:
                 Ppolicy[ind, :] = self.P[aa][ind, :]
-                #PR = self.computePR() # an apparently uneeded line, and
+                #PR = self._computePR() # an apparently uneeded line, and
                 # perhaps harmful in this implementation c.f.
                 # mdp_computePpolicyPRpolicy.m
                 Rpolicy[ind] = self.R[ind, aa]
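The surrounding loop builds the policy-restricted model: for each action aa, the states currently assigned that action take their row of the transition matrix from self.P[aa] and their reward from the matching column of self.R, giving the Markov chain and reward vector induced by the fixed policy, which is what policy evaluation needs. A hedged vectorised sketch of the same construction:

import numpy as np

def policy_model(P, R, policy):
    """Transition matrix and reward vector induced by a deterministic policy.

    P : (A, S, S), R : (S, A), policy : (S,) array of action indices.
    """
    S = P.shape[1]
    states = np.arange(S)
    Ppolicy = P[policy, states, :]   # row s taken from P[policy[s]]
    Rpolicy = R[states, policy]      # reward of the chosen action in each state
    return Ppolicy, Rpolicy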
@@ -991,7 +991,7 @@ class PolicyIteration(MDP):
             # This should update the classes policy attribute but leave the
             # value alone
-            policy_next, null = self.bellmanOperator()
+            policy_next, null = self._bellmanOperator()
             del null
 
             n_different = (policy_next != self.policy).sum()
@@ -1081,7 +1081,8 @@ class PolicyIterationModified(PolicyIteration):
             raise ValueError("PyMDPtoolbox: epsilon must be a positive real "
                              "number greater than zero.")
 
-        # computation of threshold of variation for V for an epsilon-optimal policy
+        # computation of threshold of variation for V for an epsilon-optimal
+        # policy
         if self.discount != 1:
             self.thresh = epsilon * (1 - self.discount) / self.discount
         else:
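The threshold implements the usual epsilon-optimality stopping rule for discounted problems: once the span of the change in the value function satisfies

    span(V_{n+1} - V_n) < epsilon * (1 - discount) / discount,

the greedy policy is guaranteed to be within epsilon of optimal (a standard stopping criterion for discounted MDPs, along the lines of Puterman's treatment of value-type iterations).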
@@ -1107,7 +1108,7 @@ class PolicyIterationModified(PolicyIteration):
         while not done:
             self.iter = self.iter + 1
 
-            self.policy, Vnext = self.bellmanOperator()
+            self.policy, Vnext = self._bellmanOperator()
             #[Ppolicy, PRpolicy] = mdp_computePpolicyPRpolicy(P, PR, policy);
 
             variation = getSpan(Vnext - self.V)
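getSpan, used here as the convergence measure, presumably computes the span semi-norm of its argument, max(W) - min(W); the loop then stops once the span of the value update drops below the threshold computed above. A minimal sketch under that assumption:

import numpy as np

def get_span(W):
    # span semi-norm: difference between the largest and smallest component
    return W.max() - W.min()

def converged(Vnext, V, thresh):
    # convergence test as used in the loop: span of the value update vs threshold
    return get_span(np.asarray(Vnext) - np.asarray(V)) < thresh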
@@ -1152,7 +1153,8 @@ class QLearning(MDP):
     discount : discount rate
         in ]0; 1[
     n_iter : number of iterations to execute (optional).
-        Default value = 10000; it is an integer greater than the default value.
+        Default value = 10000; it is an integer greater than the default
+        value.
 
     Results
     -------
@@ -1210,7 +1212,7 @@ class QLearning(MDP):
             raise ValueError("PyMDPtoolbox: n_iter should be greater than "
                              "10000.")
 
-        # We don't want to send this to MDP because computePR should not be
+        # We don't want to send this to MDP because _computePR should not be
         # run on it
         # MDP.__init__(self, transitions, reward, discount, None, n_iter)
         check(transitions, reward)
@@ -1364,7 +1366,7 @@ class RelativeValueIteration(MDP):
             self.iter = self.iter + 1;
 
-            self.policy, Vnext = self.bellmanOperator()
+            self.policy, Vnext = self._bellmanOperator()
             Vnext = Vnext - self.gain
 
             variation = getSpan(Vnext - self.V)
@@ -1573,9 +1575,9 @@ class ValueIteration(MDP):
         k = 1 - h.sum()
 
         Vprev = self.V
-        null, value = self.bellmanOperator()
+        null, value = self._bellmanOperator()
         # p 201, Proposition 6.6.5
-        max_iter = (log( (epsilon * (1 - self.discount) / self.discount) /
+        max_iter = (log((epsilon * (1 - self.discount) / self.discount) /
                     getSpan(value - Vprev)) /
                     log(self.discount * k))
         #self.V = Vprev
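The commented citation appears to point at Puterman, Markov Decision Processes, p. 201, Proposition 6.6.5, which bounds the number of value-iteration sweeps needed to reach an epsilon-optimal policy. In the notation of the code (gamma = self.discount, k = 1 - h.sum(), span the max-minus-min semi-norm, and value the result of one trial backup of Vprev), the quantity being evaluated is roughly

    max_iter = log( (epsilon * (1 - gamma) / gamma) / span(value - Vprev) ) / log(gamma * k)

which the surrounding method presumably rounds up to use as an iteration cap.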
@@ -1595,7 +1597,7 @@ class ValueIteration(MDP):
             Vprev = self.V.copy()
 
             # Bellman Operator: compute policy and value functions
-            self.policy, self.V = self.bellmanOperator()
+            self.policy, self.V = self._bellmanOperator()
 
             # The values, based on Q. For the function "max()": the option
             # "axis" means the axis along which to operate. In this case it