Zahra Rajabi / pymdptoolbox / Commits

Commit f3a07274, authored Jun 21, 2013 by Steven Cordwell

    update and add many comments

Parent: b7bfb004
Changes: 1
mdp.py
...
...
@@ -688,7 +688,6 @@ class MDP(object):
        elif discount is not None:
            raise ValueError("PyMDPtoolbox: the discount must be a positive "
                             "real number less than or equal to one.")
        # if the max_iter is None then the algorithm is assumed to not use it
        # in its computations
        if type(max_iter) in (int, float):
...
...
@@ -699,7 +698,7 @@ class MDP(object):
        elif max_iter is not None:
            raise ValueError("PyMDPtoolbox: max_iter must be a positive real "
                             "number greater than zero.")
        # check that epsilon is something sane
        if type(epsilon) in (int, float):
            if epsilon <= 0:
                raise ValueError("PyMDPtoolbox: epsilon must be greater than "
...
...
@@ -707,50 +706,44 @@ class MDP(object):
        elif epsilon is not None:
            raise ValueError("PyMDPtoolbox: epsilon must be a positive real "
                             "number greater than zero.")
        # we run a check on P and R to make sure they are describing an MDP. If
        # an exception isn't raised then they are assumed to be correct.
        check(transitions, reward)
        # computePR will assign the variables self.S, self.A, self.P and self.R
        self._computePR(transitions, reward)
        # the verbosity is by default turned off
        self.verbose = False
        # Initially the time taken to perform the computations is set to None
        self.time = None
        # set the initial iteration count to zero
        self.iter = 0
        # V should be stored as a vector ie shape of (S,) or (1, S)
        self.V = None
        # policy can also be stored as a vector
        self.policy = None

    def _bellmanOperator(self, V=None):
        """Apply the Bellman operator on the value function.

        Updates the value function and the Vprev-improving policy.

        Returns
        -------
        (policy, value) : tuple of new policy and its value
        """
        # Apply the Bellman operator on the value function.
        # Updates the value function and the Vprev-improving policy.
        # Returns: (policy, value), tuple of new policy and its value
        #
        # If V hasn't been sent into the method, then we assume to be working
        # on the objects V attribute
        if V is None:
            # this V should be a reference to the data rather than a copy
            V = self.V
        else:
            # make sure the user supplied V is of the right shape
            try:
                if V.shape not in ((self.S,), (1, self.S)):
                    raise ValueError("bellman: V is not the right shape.")
            except AttributeError:
                raise TypeError("bellman: V must be a numpy array or matrix.")
        # Looping through each action the Q-value matrix is calculated
        Q = empty((self.A, self.S))
        for aa in range(self.A):
            Q[aa] = self.R[aa] + self.discount * self.P[aa].dot(V)
        # Get the policy and value, for now it is being returned but...
        # Which way is better?
        # 1. Return, (policy, value)
        return (Q.argmax(axis=0), Q.max(axis=0))
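As a quick illustration of the backup above: the method computes Q[a] = R[a] + discount * P[a].dot(V) for every action and then takes the componentwise argmax/max over actions. A minimal sketch of that step on a made-up 2-state, 2-action problem (the arrays are invented; only the update itself mirrors the method):

    # Sketch of the Bellman backup used in _bellmanOperator; arrays are illustrative.
    import numpy as np

    P = np.array([[[0.9, 0.1], [0.2, 0.8]],   # P[a] is the S x S matrix for action a
                  [[0.5, 0.5], [0.0, 1.0]]])
    R = np.array([[1.0, 0.0],                 # R[a, s] = expected reward of action a in state s
                  [2.0, -1.0]])
    discount = 0.9
    V = np.zeros(2)                           # current value function, shape (S,)

    Q = np.empty((2, 2))                      # A x S matrix of Q-values
    for aa in range(2):
        Q[aa] = R[aa] + discount * P[aa].dot(V)

    policy, value = Q.argmax(axis=0), Q.max(axis=0)
    print(policy, value)                      # greedy action and its value per state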
...
...
@@ -759,23 +752,21 @@ class MDP(object):
        # self.policy = Q.argmax(axis=1)

    def _computePR(self, P, R):
        """Compute the reward for the system in one state choosing an action.

        Arguments
        ---------
        Let S = number of states, A = number of actions
            P(SxSxA) = transition matrix
                P could be an array with 3 dimensions or a cell array (1xA),
                each cell containing a matrix (SxS) possibly sparse
            R(SxSxA) or (SxA) = reward matrix
                R could be an array with 3 dimensions (SxSxA) or a cell array
                (1xA), each cell containing a sparse matrix (SxS) or a 2D
                array(SxA) possibly sparse
        Evaluation
        ----------
            PR(SxA) = reward matrix
        """
        # Compute the reward for the system in one state choosing an action.
        #
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        #     P(SxSxA) = transition matrix
        #         P could be an array with 3 dimensions or a cell array (1xA),
        #         each cell containing a matrix (SxS) possibly sparse
        #     R(SxSxA) or (SxA) = reward matrix
        #         R could be an array with 3 dimensions (SxSxA) or a cell array
        #         (1xA), each cell containing a sparse matrix (SxS) or a 2D
        #         array(SxA) possibly sparse
        #
        # Evaluation
        # ----------
        #     PR(SxA) = reward matrix
        #
        # We assume that P and R define a MDP, i.e. the assumption is that
        # check(P, R) has already been run and doesn't fail.
        #
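The body of _computePR is elided in this hunk, but the contract described in the docstring (collapse P(SxSxA) and R(SxSxA) into an SxA expected-reward matrix) can be sketched roughly as follows. The arrays and the loop are illustrative assumptions, not the toolbox's own implementation:

    # Rough sketch of the PR(SxA) contract: PR[s, a] = sum_s' P[a][s, s'] * R[a][s, s'].
    # Illustrative only; not the toolbox's actual _computePR body.
    import numpy as np

    S, A = 3, 2
    P = np.random.dirichlet(np.ones(S), size=(A, S))   # A x S x S, each row sums to 1
    R = np.random.randn(A, S, S)                        # A x S x S reward array

    PR = np.empty((S, A))
    for aa in range(A):
        PR[:, aa] = (P[aa] * R[aa]).sum(axis=1)
    print(PR.shape)                                     # (S, A)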
...
...
@@ -819,7 +810,7 @@ class MDP(object):
        self.R = tuple(self.R)

    def _iterate(self):
        """
        Raise error because child classes should implement this function.
        """
        # Raise error because child classes should implement this function.
        raise NotImplementedError("You should create an _iterate() method.")

    def setSilent(self):
...
...
@@ -902,14 +893,13 @@ class FiniteHorizon(MDP):
        # Set the reward for the final transition to h, if specified.
        if h is not None:
            self.V[:, N] = h
        # Call the iteration method
        self._iterate()

    def _iterate(self):
        """
        Run the finite horizon algorithm.
        """
        # Run the finite horizon algorithm.
        self.time = time()
        # loop through each time period
        for n in range(self.N):
            W, X = self._bellmanOperator(self.V[:, self.N - n])
            self.V[:, self.N - n - 1] = X
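The loop above is backward induction: V has N + 1 columns, the column for the final stage holds the terminal reward h, and each pass writes the backed-up values one column earlier. A small sketch of the same indexing pattern, with a stand-in for the Bellman operator (illustrative numbers, not the toolbox's code):

    # Sketch of the backward-induction indexing in FiniteHorizon._iterate.
    import numpy as np

    S, N, discount = 2, 3, 0.9
    V = np.zeros((S, N + 1))          # V[:, N] would hold the terminal reward h
    for n in range(N):
        # stand-in for self._bellmanOperator(self.V[:, self.N - n])
        X = 1.0 + discount * V[:, N - n]
        V[:, N - n - 1] = X
    print(V)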
...
...
@@ -917,7 +907,7 @@ class FiniteHorizon(MDP):
            if self.verbose:
                print("stage: %s ... policy transpose : %s") % (
                    self.N - n, self.policy[:, self.N - n - 1].tolist())
        # update time spent running
        self.time = time() - self.time
        # After this we could create a tuple of tuples for the values and
        # policies.
...
...
@@ -964,7 +954,7 @@ class LP(MDP):
    def __init__(self, transitions, reward, discount):
        """Initialise a linear programming MDP."""
        # import some functions from cvxopt and set them as object methods
        try:
            from cvxopt import matrix, solvers
            self._linprog = solvers.lp
...
...
@@ -972,21 +962,21 @@ class LP(MDP):
        except ImportError:
            raise ImportError("The python module cvxopt is required to use "
                              "linear programming functionality.")
        # we also need diagonal matrices, and using a sparse one may be more
        # memory efficient
        from scipy.sparse import eye as speye
        self._speye = speye
        # initialise the MDP. epsilon and max_iter are not needed
        MDP.__init__(self, transitions, reward, discount, None, None)
        # Set the cvxopt solver to be quiet by default, but ...
        # this doesn't do what I want it to do c.f. issue #3
        if not self.verbose:
            solvers.options['show_progress'] = False
        # Call the iteration method
        self._iterate()

    def _iterate(self):
        """
        Run the linear programming algorithm.
        """
        # Run the linear programming algorithm.
        self.time = time()
        # The objective is to resolve : min V / V >= PR + discount*P*V
        # The function linprog of the optimisation Toolbox of Mathworks
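The constraint in the comment, V >= PR + discount*P*V for every action, can be rearranged to (discount*P[a] - I) V <= -PR[:, a], which is the "G x <= h" form that cvxopt's solvers.lp expects. A rough sketch of that formulation on invented data follows; the toolbox's own construction of f, M and h is elided in this diff, so this is an assumption about the shape of the problem rather than a copy of it:

    # Sketch of the LP  min sum(V)  s.t.  V >= PR[:, a] + discount * P[a] * V  for all a,
    # rewritten as (discount * P[a] - I) V <= -PR[:, a].  Data are illustrative.
    import numpy as np
    from cvxopt import matrix, solvers

    S, A, discount = 2, 2, 0.9
    P = np.array([[[0.9, 0.1], [0.2, 0.8]],
                  [[0.5, 0.5], [0.0, 1.0]]])
    PR = np.array([[1.0, 2.0],
                   [0.0, -1.0]])                       # S x A expected rewards

    f = matrix(np.ones(S))                             # objective: minimise sum of V
    M = matrix(np.vstack([discount * P[a] - np.eye(S) for a in range(A)]))
    h = matrix(np.hstack([-PR[:, a] for a in range(A)]))

    sol = solvers.lp(f, M, h)                          # default cvxopt solver
    V = np.array(sol['x']).flatten()
    print(V)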
...
...
@@ -1007,13 +997,12 @@ class LP(MDP):
        # Using the glpk option will make this behave more like Octave
        # (Octave uses glpk) and perhaps Matlab. If solver=None (ie using the
        # default cvxopt solver) then V agrees with the Octave equivalent
        # only to 10e-8 places.
        # only to 10e-8 places. This assumes glpk is installed of course.
        self.V = array(self._linprog(f, M, -h, solver='glpk')['x'])
        # apply the Bellman operator
        self.policy, self.V = self._bellmanOperator()
        # update the time spent solving
        self.time = time() - self.time
        # store value and policy as tuples
        self.V = tuple(self.V.tolist())
        self.policy = tuple(self.policy.tolist())
...
...
@@ -1057,40 +1046,47 @@ class PolicyIteration(MDP):
    >>> import mdp
    >>> P, R = mdp.exampleRand(5, 3)
    >>> pi = mdp.PolicyIteration(P, R, 0.9)
    >>> P, R = mdp.exampleForest()
    >>> pi = mdp.PolicyIteration(P, R, 0.9)
    >>> pi.V
    >>> pi.policy

    """

    def __init__(self, transitions, reward, discount, policy0=None,
                 max_iter=1000, eval_type=0):
        """Initialise a policy iteration MDP."""
        # Initialise a policy iteration MDP.
        #
        # Set up the MDP, but don't need to worry about epsilon values
        MDP.__init__(self, transitions, reward, discount, None, max_iter)
        # Check if the user has supplied an initial policy. If not make one.
        if policy0 == None:
            # initialise the policy to the one which maximises the expected
            # Initialise the policy to the one which maximises the expected
            # immediate reward
            self.V = zeros(self.S)
            self.policy, null = self._bellmanOperator()
            null = zeros(self.S)
            self.policy, null = self._bellmanOperator(null)
            del null
        else:
            # Use the policy that the user supplied
            # Make sure it is a numpy array
            policy0 = array(policy0)
            # Make sure the policy is the right size and shape
            if not policy0.shape in ((self.S, ), (self.S, 1), (1, self.S)):
                raise ValueError("PyMDPtolbox: policy0 must a vector with "
                                 "length S.")
            # reshape the policy to be a vector
            policy0 = policy0.reshape(self.S)
            # The policy can only contain integers between 1 and S
            if (mod(policy0, 1).any() or (policy0 < 0).any() or
                    (policy0 >= self.S).any()):
                raise ValueError("PyMDPtoolbox: policy0 must be a vector of "
                                 "integers between 1 and S.")
            else:
                self.policy = policy0
        # set or reset the initial values to zero
        # set the initial values to zero
        self.V = zeros(self.S)
        # Do some setup depending on the evaluation type
        if eval_type in (0, "matrix"):
            from numpy.linalg import solve
            from scipy.sparse import eye
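The checks above mean policy0 may be passed as a plain list (it is converted with array), as long as it has S integer entries in the valid range. A hypothetical usage in the style of the class docstring's example, assuming exampleForest() returns its documented default of three states:

    # Hypothetical: start policy iteration from an explicit all-zeros policy.
    # Assumes the 3-state default of exampleForest(); adjust the length otherwise.
    import mdp

    P, R = mdp.exampleForest()
    pi = mdp.PolicyIteration(P, R, 0.9, policy0=[0, 0, 0])
    print(pi.policy)
    print(pi.V)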
...
...
@@ -1104,31 +1100,29 @@ class PolicyIteration(MDP):
                             "evaluation or 1 for iterative evaluation. "
                             "The strings 'matrix' and 'iterative' can also "
                             "be used.")
        # Call the iteration method
        self._iterate()

    def _computePpolicyPRpolicy(self):
        """Compute the transition matrix and the reward matrix for a policy.

        Arguments
        ---------
        Let S = number of states, A = number of actions
            P(SxSxA) = transition matrix
                P could be an array with 3 dimensions or a cell array (1xA),
                each cell containing a matrix (SxS) possibly sparse
            R(SxSxA) or (SxA) = reward matrix
                R could be an array with 3 dimensions (SxSxA) or
                a cell array (1xA), each cell containing a sparse matrix (SxS) or
                a 2D array(SxA) possibly sparse
            policy(S) = a policy

        Evaluation
        ----------
        Ppolicy(SxS) = transition matrix for policy
        PRpolicy(S) = reward matrix for policy
        """
        # Compute the transition matrix and the reward matrix for a policy.
        #
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        #     P(SxSxA) = transition matrix
        #         P could be an array with 3 dimensions or a cell array (1xA),
        #         each cell containing a matrix (SxS) possibly sparse
        #     R(SxSxA) or (SxA) = reward matrix
        #         R could be an array with 3 dimensions (SxSxA) or
        #         a cell array (1xA), each cell containing a sparse matrix (SxS) or
        #         a 2D array(SxA) possibly sparse
        #     policy(S) = a policy
        #
        # Evaluation
        # ----------
        #     Ppolicy(SxS) = transition matrix for policy
        #     PRpolicy(S) = reward matrix for policy
        #
        Ppolicy = empty((self.S, self.S))
        Rpolicy = zeros(self.S)
        for aa in range(self.A):
            # avoid looping over S
...
...
@@ -1152,37 +1146,36 @@ class PolicyIteration(MDP):
        return (Ppolicy, Rpolicy)

    def _evalPolicyIterative(self, V0=0, epsilon=0.0001, max_iter=10000):
        """Evaluate a policy using iteration.

        Arguments
        ---------
        Let S = number of states, A = number of actions
            P(SxSxA) = transition matrix
                P could be an array with 3 dimensions or
                a cell array (1xS), each cell containing a matrix possibly sparse
            R(SxSxA) or (SxA) = reward matrix
                R could be an array with 3 dimensions (SxSxA) or
                a cell array (1xA), each cell containing a sparse matrix (SxS) or
                a 2D array(SxA) possibly sparse
            discount = discount rate in ]0; 1[
            policy(S) = a policy
            V0(S) = starting value function, optional (default : zeros(S,1))
            epsilon = epsilon-optimal policy search, upper than 0,
                optional (default : 0.0001)
            max_iter = maximum number of iteration to be done, upper than 0,
                optional (default : 10000)

        Evaluation
        ----------
        Vpolicy(S) = value function, associated to a specific policy

        Notes
        -----
        In verbose mode, at each iteration, displays the condition which
        stopped iterations: epsilon-optimum value function found or maximum
        number of iterations reached.
        """
        # Evaluate a policy using iteration.
        #
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        #     P(SxSxA) = transition matrix
        #         P could be an array with 3 dimensions or
        #         a cell array (1xS), each cell containing a matrix possibly sparse
        #     R(SxSxA) or (SxA) = reward matrix
        #         R could be an array with 3 dimensions (SxSxA) or
        #         a cell array (1xA), each cell containing a sparse matrix (SxS) or
        #         a 2D array(SxA) possibly sparse
        #     discount = discount rate in ]0; 1[
        #     policy(S) = a policy
        #     V0(S) = starting value function, optional (default : zeros(S,1))
        #     epsilon = epsilon-optimal policy search, upper than 0,
        #         optional (default : 0.0001)
        #     max_iter = maximum number of iteration to be done, upper than 0,
        #         optional (default : 10000)
        #
        # Evaluation
        # ----------
        # Vpolicy(S) = value function, associated to a specific policy
        #
        # Notes
        # -----
        # In verbose mode, at each iteration, displays the condition which
        # stopped iterations: epsilon-optimum value function found or maximum
        # number of iterations reached.
        #
        if (type(V0) in (int, float)) and (V0 == 0):
            policy_V = zeros(self.S)
        else:
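The docstring describes plain fixed-point iteration: repeat V <- Rpolicy + discount * Ppolicy.dot(V) until the change drops below epsilon or max_iter is hit. The method's body is elided here, so the following is a generic sketch of that scheme on invented inputs rather than the toolbox's code:

    # Generic iterative policy evaluation sketch; Ppolicy/Rpolicy are illustrative.
    import numpy as np

    Ppolicy = np.array([[0.9, 0.1], [0.2, 0.8]])   # S x S transitions under the policy
    Rpolicy = np.array([1.0, 0.5])                 # length-S rewards under the policy
    discount, epsilon, max_iter = 0.9, 0.0001, 10000

    V = np.zeros(2)
    for it in range(max_iter):
        V_prev = V
        V = Rpolicy + discount * Ppolicy.dot(V_prev)
        if np.abs(V - V_prev).max() < epsilon:     # epsilon-optimal stopping test
            break
    print(it, V)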
...
...
@@ -1225,62 +1218,60 @@ class PolicyIteration(MDP):
        self.V = policy_V

    def _evalPolicyMatrix(self):
        """Evaluate the value function of the policy using linear equations.

        Arguments
        ---------
        Let S = number of states, A = number of actions
            P(SxSxA) = transition matrix
                P could be an array with 3 dimensions or a cell array (1xA),
                each cell containing a matrix (SxS) possibly sparse
            R(SxSxA) or (SxA) = reward matrix
                R could be an array with 3 dimensions (SxSxA) or
                a cell array (1xA), each cell containing a sparse matrix (SxS) or
                a 2D array(SxA) possibly sparse
            discount = discount rate in ]0; 1[
            policy(S) = a policy

        Evaluation
        ----------
        Vpolicy(S) = value function of the policy
        """
        # Evaluate the value function of the policy using linear equations.
        #
        # Arguments
        # ---------
        # Let S = number of states, A = number of actions
        #     P(SxSxA) = transition matrix
        #         P could be an array with 3 dimensions or a cell array (1xA),
        #         each cell containing a matrix (SxS) possibly sparse
        #     R(SxSxA) or (SxA) = reward matrix
        #         R could be an array with 3 dimensions (SxSxA) or
        #         a cell array (1xA), each cell containing a sparse matrix (SxS) or
        #         a 2D array(SxA) possibly sparse
        #     discount = discount rate in ]0; 1[
        #     policy(S) = a policy
        #
        # Evaluation
        # ----------
        #     Vpolicy(S) = value function of the policy
        #
        Ppolicy, Rpolicy = self._computePpolicyPRpolicy()
        # V = PR + gPV  =>  (I - gP)V = PR  =>  V = inv(I - gP) * PR
        self.V = self._lin_eq(
            (self._speye(self.S, self.S) - self.discount * Ppolicy), Rpolicy)

    def _iterate(self):
        """
        Run the policy iteration algorithm.
        """
        # Run the policy iteration algorithm.
        # If verbose then print a header
        if self.verbose:
            print('  Iteration  Number_of_different_actions')
        # Set up the while stopping condition and the current time
        done = False
        self.time = time()
        # loop until a stopping condition is reached
        while not done:
            self.iter += 1
            # these _evalPolicy* functions will update the classes value
            # attribute
            if self.eval_type == "matrix":
                self._evalPolicyMatrix()
            elif self.eval_type == "iterative":
                self._evalPolicyIterative()
            # This should update the classes policy attribute but leave the
            # value alone
            policy_next, null = self._bellmanOperator()
            del null
            # calculate in how many places does the old policy disagree with
            # the new policy
            n_different = (policy_next != self.policy).sum()
            # if verbose then continue printing a table
            if self.verbose:
                print('    %s                 %s') % (self.iter, n_different)
            # Once the policy is unchanging or the maximum number of
            # iterations has been reached then stop
            if n_different == 0:
                done = True
                if self.verbose:
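The comment "V = PR + gPV => (I - gP)V = PR" reduces matrix policy evaluation to a single linear solve. A small sketch of that identity with numpy.linalg.solve on invented inputs (the method itself goes through self._lin_eq and self._speye):

    # Sketch of matrix policy evaluation: solve (I - discount * Ppolicy) V = Rpolicy.
    import numpy as np

    Ppolicy = np.array([[0.9, 0.1], [0.2, 0.8]])
    Rpolicy = np.array([1.0, 0.5])
    discount = 0.9

    V = np.linalg.solve(np.eye(2) - discount * Ppolicy, Rpolicy)
    print(V)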
...
...
@@ -1293,9 +1284,8 @@ class PolicyIteration(MDP):
                          "of iteration condition.")
            else:
                self.policy = policy_next
        # update the time to return the computation time
        self.time = time() - self.time
        # store value and policy as tuples
        self.V = tuple(self.V.tolist())
        self.policy = tuple(self.policy.tolist())
...
...
@@ -1997,7 +1987,7 @@ class ValueIterationGS(ValueIteration):
        self._iterate()

    def _iterate(self):
        """
        Run the value iteration Gauss-Seidel algorithm.
        """
        # Run the value iteration Gauss-Seidel algorithm.
        done = False
...
...