Commit 7a9110f8 authored by Steven Cordwell

added class ValueIterationGS

parent 7fa677ca
@@ -572,7 +572,41 @@ class PolicyIteration(MDP):
self.policy = tuple(array(self.policy).reshape(self.S).tolist())
class PolicyIterationModified(MDP):
"""Resolution of discounted MDP with modified policy iteration algorithm.
"""Resolution of discounted MDP with policy iteration algorithm
Arguments
---------
Let S = number of states, A = number of actions
P(SxSxA) = transition matrix
P could be an array with 3 dimensions or
a cell array (1xA), each cell containing a matrix (SxS) possibly sparse
R(SxSxA) or (SxA) = reward matrix
R could be an array with 3 dimensions (SxSxA) or
a cell array (1xA), each cell containing a sparse matrix (SxS) or
a 2D array(SxA) possibly sparse
discount = discount rate, in ]0, 1[
policy0(S) = starting policy, optional
max_iter = maximum number of iterations, must be greater than 0,
optional (default 1000)
eval_type = type of function used to evaluate policy:
0 for mdp_eval_policy_matrix, else mdp_eval_policy_iterative
optional (default 0)
Data Attributes
---------------
V(S) = value function
policy(S) = optimal policy
iter = number of iterations performed
cpu_time = CPU time used
Notes
-----
In verbose mode, at each iteration, displays the number
of actions that differ between policy n-1 and policy n
Examples
--------
>>> import mdp
"""
def __init__(self, transitions, reward, discount, epsilon=0.01, max_iter=10):
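A minimal usage sketch for the Examples section above; the two-state transition and reward arrays are illustrative only, and it is assumed that the solver is run by calling iterate(), in the same way as the other classes in this module:

>>> import mdp
>>> from numpy import array
>>> P = (array([[0.5, 0.5], [0.8, 0.2]]), array([[0.0, 1.0], [0.1, 0.9]]))
>>> R = array([[5, 10], [-1, 2]])
>>> pim = mdp.PolicyIterationModified(P, R, 0.9)
>>> pim.iterate()
>>> policy = pim.policy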
@@ -1019,6 +1053,151 @@ class ValueIteration(MDP):
self.time = time() - self.time
class ValueIterationGS(MDP):
"""Resolution of discounted MDP with value iteration Gauss-Seidel algorithm.
"""
raise NotImplementedError("This class has not been implemented yet.")
"""Resolution of discounted MDP with value iteration Gauss-Seidel algorithm
Arguments
---------
Let S = number of states, A = number of actions
P(SxSxA) = transition matrix
P could be an array with 3 dimensions or
a cell array (1xA), each cell containing a matrix (SxS) possibly sparse
R(SxSxA) or (SxA) = reward matrix
R could be an array with 3 dimensions (SxSxA) or
a cell array (1xA), each cell containing a sparse matrix (SxS) or
a 2D array(SxA) possibly sparse
discount = discount rate in ]0; 1]
be careful to check the conditions of convergence when discount = 1.
epsilon = search for an epsilon-optimal policy, must be greater than 0,
optional (default : 0.01)
max_iter = maximum number of iterations, must be greater than 0,
optional (default : computed)
V0(S) = starting value function, optional (default : zeros(S,1))
Evaluation
----------
policy(S) = epsilon-optimal policy
iter = number of iterations performed
cpu_time = CPU time used
Notes
-----
In verbose mode, at each iteration, displays the variation of V
and the condition which stopped the iterations: an epsilon-optimal policy
was found or the maximum number of iterations was reached.
Examples
--------
"""
def __init__(self, transitions, reward, discount, epsilon=0.01, max_iter=10, initial_value=0):
""""""
MDP.__init__(self, discount, max_iter):
# initialization of optional arguments
if (initial_value == 0):
self.value = matrix(zeros((self.S, 1)))
else:
if (initial_value.size != self.S):
raise ValueError("The initial value must be length S")
self.value = matrix(initial_value)
if epsilon <= 0:
raise ValueError("epsilon must be greater than 0")
#if discount == 1
# disp('--------------------------------------------------------')
# disp('MDP Toolbox WARNING: check conditions of convergence.')
# disp('With no discount, convergence is not always assumed.')
# disp('--------------------------------------------------------')
#end;
PR = self.computePR(P,R)
#% initialization of optional arguments
#if nargin < 6; V0 = zeros(S,1); end;
#if nargin < 4; epsilon = 0.01; end;
#% compute a bound for the number of iterations
#if discount ~= 1
# computed_max_iter = mdp_value_iteration_bound_iter(P, R, discount, epsilon, V0);
#end;
#if nargin < 5
# if discount ~= 1
# max_iter = computed_max_iter;
# else
# max_iter = 1000;
# end;
#else
# if discount ~= 1 && max_iter > computed_max_iter
# disp(['MDP Toolbox WARNING: max_iter is bounded by ' num2str(computed_max_iter,'%12.1f') ])
# max_iter = computed_max_iter;
# end;
#end;
#% computation of threshold of variation for V for an epsilon-optimal policy
#if discount ~= 1
# thresh = epsilon * (1-discount)/discount
#else
# thresh = epsilon
self.discount = discount
if (discount < 1):
# compute a bound for the number of iterations
#self.max_iter = self.boundIter(epsilon)
self.max_iter = 5000
# computation of threshold of variation for V for an epsilon-optimal policy
self.thresh = epsilon * (1 - self.discount) / self.discount
else: # discount == 1
# bound for the number of iterations
self.max_iter = max_iter
# threshold of variation for V for an epsilon-optimal policy
self.thresh = epsilon
self.iter = 0
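    # Note added for clarity (not from the original source): the threshold
    # above implements the standard span-based stopping rule for discounted
    # value iteration. Once span(V_n - V_{n-1}) < epsilon * (1 - discount) / discount,
    # the policy that is greedy with respect to the latest value function is
    # epsilon-optimal. For example, with discount = 0.9 and epsilon = 0.01,
    # the loop in iterate() below stops once the span of the value update
    # falls below 0.01 * (1 - 0.9) / 0.9, i.e. about 0.0011.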
def iterate(self):
    """Run the Gauss-Seidel value iteration algorithm."""
    V = self.value
    done = False

    if self.verbose:
        print('  Iteration    V_variation')

    self.time = time()
    while not done:
        self.iter = self.iter + 1
        Vprev = V.copy()

        # Gauss-Seidel sweep: V[s] is overwritten in place, so states
        # updated later in the same sweep already use the new values
        for s in range(self.S):
            Q = [float(self.R[s, a] + self.discount * self.P[a][s, :] * V)
                 for a in range(self.A)]
            V[s] = max(Q)

        variation = self.getSpan(V - Vprev)

        if self.verbose:
            print("      %s         %s" % (self.iter, variation))

        if variation < self.thresh:
            done = True
            if self.verbose:
                print('MDP Toolbox : iterations stopped, epsilon-optimal policy found')
        elif self.iter == self.max_iter:
            done = True
            if self.verbose:
                print('MDP Toolbox : iterations stopped by maximum number of iteration condition')

    # extract the epsilon-optimal policy greedily from the final value function
    self.policy = []
    for s in range(self.S):
        Q = zeros(self.A)
        for a in range(self.A):
            Q[a] = float(self.R[s, a] + self.discount * self.P[a][s, :] * V)
        self.value[s] = Q.max()
        self.policy.append(int(Q.argmax()))

    self.time = time() - self.time
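By analogy with the Examples sections of the other classes, a minimal usage sketch for the new class; the two-state transition and reward arrays are illustrative only, and the solver is run via the iterate() method defined above:

>>> import mdp
>>> from numpy import array
>>> P = (array([[0.5, 0.5], [0.8, 0.2]]), array([[0.0, 1.0], [0.1, 0.9]]))
>>> R = array([[5, 10], [-1, 2]])
>>> vigs = mdp.ValueIterationGS(P, R, 0.9, epsilon=0.01)
>>> vigs.iterate()
>>> policy = vigs.policy

Because each sweep reuses the values already updated earlier in that sweep, the Gauss-Seidel variant typically needs fewer sweeps than standard value iteration to reach the same threshold.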