Commit 5f7f035b authored by Steven Cordwell

Fixes and unit-test fixes for the ValueIteration and PolicyIteration classes

parent fa69c0d3
......@@ -33,7 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
from numpy import absolute, array, diag, matrix, mean, mod, multiply, ndarray
from numpy import ones, zeros
from numpy import nonzero, ones, zeros
from numpy.random import rand
from math import ceil, log, sqrt
from random import randint, random
......@@ -419,6 +419,13 @@ def exampleRand(S, A, is_sparse=False, mask=None):
return (P, R)
def getSpan(self, W):
"""Returns the span of W
sp(W) = max W(s) - min W(s)
"""
return (W.max() - W.min())
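Since getSpan is now a module-level helper that only reads W, it can be checked in isolation; a minimal usage sketch, not part of this commit, with an arbitrary example vector:

from numpy import matrix

W = matrix('1.0; 3.5; 2.0')   # any value-function column vector
span = W.max() - W.min()      # exactly what getSpan(W) returns
assert abs(span - 2.5) < 1e-12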
class MDP(object):
"""The Markov Decision Problem Toolbox."""
......@@ -502,27 +509,31 @@ class MDP(object):
Ppolicy(SxS) = transition matrix for policy
PRpolicy(S) = reward matrix for policy
"""
Ppolicy = zeros(())
PRpolicy = zeros(())
for a in range(self.A): # avoid looping over S
ind = find(self.policy == a) # the rows that use action a
if not isempty(ind):
if iscell(P):
Ppolicy[ind,:] = self.P[a][ind,:]
else:
Ppolicy[ind,:] = self.P[ind,:,a]
Ppolicy = matrix(zeros((self.S, self.S)))
Rpolicy = matrix(zeros((self.S, 1)))
for aa in range(self.A): # avoid looping over S
# the rows that use action a. .getA1() is used to make sure that
# ind is a 1 dimensional vector
ind = nonzero(self.policy == aa)[0].getA1()
if ind.size > 0: # if no rows use action a, then no point continuing
Ppolicy[ind, :] = self.P[aa][ind, :]
PR = self.computePR()
PRpolicy[ind, 1] = PR[ind, a]
# self.R cannot be sparse with the code in its current condition
#PR = self.computePR() # an apparently unneeded line, and
# perhaps harmful in this implementation c.f.
# mdp_computePpolicyPRpolicy.m
Rpolicy[ind] = self.R[ind, aa]
# self.R cannot be sparse with the code in its current condition, but
# it should be possible in the future. Also, if R is so big that it's
# a good idea to use a sparse matrix for it, then converting PRpolicy
# from a dense to sparse matrix doesn't seem very memory efficient
if type(self.R) is sparse:
PRpolicy = sparse(PRpolicy)
Rpolicy = sparse(Rpolicy)
#self.Ppolicy = Ppolicy
#self.Rpolicy = PRpolicy
return (Ppolicy, PRpolicy)
#self.Rpolicy = Rpolicy
return (Ppolicy, Rpolicy)
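For orientation, a standalone sketch (not from this commit) of the per-action row selection performed above, using a made-up two-state problem where self.policy would be an S x 1 matrix of action indices:

from numpy import matrix, nonzero, zeros

P = [matrix('0.5 0.5; 0.8 0.2'), matrix('0 1; 0.1 0.9')]  # one S x S matrix per action
policy = matrix('0; 1')                                   # action chosen in each state
Ppolicy = matrix(zeros((2, 2)))
for aa in range(2):
    ind = nonzero(policy == aa)[0].getA1()  # rows using action aa, as a 1-D array
    if ind.size > 0:
        Ppolicy[ind, :] = P[aa][ind, :]
# Ppolicy is now matrix([[0.5, 0.5], [0.1, 0.9]])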
def computePR(self, P, R):
"""Computes the reward for the system in one state chosing an action
......@@ -584,13 +595,6 @@ class MDP(object):
"""
raise NotImplementedError("You should create an iterate() method.")
def getSpan(self, W):
"""Returns the span of W
sp(W) = max W(s) - min W(s)
"""
return (W.max() - W.min())
def setSilent(self):
"""Ask for running resolution functions of the MDP Toolbox in silent
mode.
......@@ -804,6 +808,8 @@ class PolicyIteration(MDP):
if eval_type in (0, "matrix"):
from numpy.linalg import solve
from scipy.sparse import eye
self.speye = eye
self.lin_eq = solve
self.eval_type = "matrix"
elif eval_type in (1, "iterative"):
......@@ -844,11 +850,11 @@ class PolicyIteration(MDP):
epsilon-optimum value function found or maximum number of iterations reached.
"""
if V0 == 0:
Vpolicy = zeros(self.S, 1)
policy_V = zeros((self.S, 1))
else:
raise NotImplementedError("evalPolicyIterative: case V0 != 0 not implemented. Use V0=0 instead.")
Ppolicy, PRpolicy = self.computePpolicyPRpolicy(self.P, self.R, self.policy)
policy_P, policy_R = self.computePpolicyPRpolicy()
if self.verbose:
print(' Iteration V_variation')
......@@ -857,21 +863,24 @@ class PolicyIteration(MDP):
done = False
while not done:
itr = itr + 1
Vprev = Vpolicy
Vpolicy = PRpolicy + self.discount * Ppolicy * Vprev
variation = max(abs(Vpolicy - Vprev))
Vprev = policy_V
policy_V = policy_R + self.discount * policy_P * Vprev
variation = absolute(policy_V - Vprev).max()
if self.verbose:
print(' %s %s') % (itr, variation)
if variation < ((1 - self.discount) / self.discount) * epsilon: # to ensure |Vn - Vpolicy| < epsilon
done = True
if self.verbose:
print('MDP Toolbox: iterations stopped, epsilon-optimal value function')
print('PyMDPtoolbox: iterations stopped, epsilon-optimal value function')
elif itr == max_iter:
done = True
if self.verbose:
print('MDP Toolbox: iterations stopped by maximum number of iteration condition')
print('PyMDPtoolbox: iterations stopped by maximum number of iteration condition')
self.value = Vpolicy
self.value = policy_V
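The loop above is the standard fixed-point iteration V_{k+1} = R_policy + discount * P_policy * V_k, stopped once the largest change falls below ((1 - discount) / discount) * epsilon. A self-contained sketch with made-up data, separate from the toolbox code:

from numpy import absolute, matrix, zeros

Ppolicy = matrix('0.5 0.5; 0.1 0.9')   # S x S transition matrix under the policy
Rpolicy = matrix('5; 2')               # S x 1 reward vector under the policy
discount, epsilon = 0.9, 0.0001
V = matrix(zeros((2, 1)))
while True:
    Vprev = V
    V = Rpolicy + discount * Ppolicy * Vprev
    if absolute(V - Vprev).max() < ((1 - discount) / discount) * epsilon:
        break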
def evalPolicyMatrix(self):
"""Evaluation of the value function of a policy
......@@ -894,12 +903,12 @@ class PolicyIteration(MDP):
Vpolicy(S) = value function of the policy
"""
Ppolicy, PRpolicy = self.computePpolicyPRpolicy(self.P, self.R, self.policy)
Ppolicy, Rpolicy = self.computePpolicyPRpolicy()
# V = PR + gPV => (I-gP)V = PR => V = inv(I-gP)* PR
self.value = self.lin_eq((speye(self.S, self.S) - self.discount * Ppolicy) , PRpolicy)
self.value = self.lin_eq((self.speye(self.S, self.S) - self.discount * Ppolicy) , Rpolicy)
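The matrix evaluation solves (I - discount * P_policy) V = R_policy directly instead of iterating; a sketch with the same made-up data as above, using a dense NumPy identity rather than scipy.sparse.eye for simplicity:

from numpy import eye, matrix
from numpy.linalg import solve

Ppolicy = matrix('0.5 0.5; 0.1 0.9')
Rpolicy = matrix('5; 2')
discount = 0.9
V = solve(eye(2) - discount * Ppolicy, Rpolicy)   # the value the iterative loop converges to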
def iterate(self):
""""""
"""Run the policy iteration algorithm."""
if self.verbose:
print(' Iteration Number_of_different_actions')
......@@ -927,8 +936,16 @@ class PolicyIteration(MDP):
if self.verbose:
print(' %s %s') % (self.iter, n_different)
if (policy_next == self.policy).all() or (self.iter == self.max_iter):
if n_different == 0:
done = True
if self.verbose:
print("...iterations stopped, unchanging policy found")
elif (self.iter == self.max_iter):
done = True
if self.verbose:
print("...iterations stopped by maximum number of iteration condition")
else:
self.policy = policy_next
self.time = time() - self.time
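The reworked stopping test counts the states whose action changed rather than comparing the whole policy in one expression; a tiny illustration with made-up policies:

from numpy import matrix

policy = matrix('0; 1; 0')
policy_next = matrix('0; 0; 0')
n_different = (policy_next != policy).sum()   # 1 here; a count of 0 ends the iteration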
......@@ -1009,7 +1026,7 @@ class PolicyIterationModified(MDP):
Vnext, policy = self.bellmanOperator(self.P, self.PR, self.discount, self.V)
#[Ppolicy, PRpolicy] = mdp_computePpolicyPRpolicy(P, PR, policy);
variation = self.getSpan(Vnext - self.value);
variation = getSpan(Vnext - self.value);
if self.verbose:
print(" %s %s" % (self.iter, variation))
......@@ -1237,7 +1254,7 @@ class RelativeValueIteration(MDP):
Unext, policy = self.bellmanOperator(self.P, self.R, 1, self.U)
Unext = Unext - self.gain
variation = self.getSpan(Unext - self.U)
variation = getSpan(Unext - self.U)
if self.verbose:
print(" %s %s" % (self.iter, variation))
......@@ -1382,8 +1399,8 @@ class ValueIteration(MDP):
if (initial_value == 0):
self.value = matrix(zeros((self.S, 1)))
else:
if (initial_value.size != self.S):
raise ValueError("The initial value must be length S")
if (not initial_value.shape in ((self.S, ), (self.S, 1), (1, self.S))):
raise ValueError("The initial value must be a vector of length S")
else:
self.value = matrix(initial_value)
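The relaxed check accepts any vector-like initial value of length S, whatever its orientation; a quick illustration of the shapes that now pass (S = 3 is arbitrary):

from numpy import zeros

S = 3
for v in (zeros(S), zeros((S, 1)), zeros((1, S))):
    assert v.shape in ((S,), (S, 1), (1, S))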
......@@ -1431,10 +1448,10 @@ class ValueIteration(MDP):
k = 1 - h.sum()
Vprev = self.value
self.bellmanOperator()
null, value = self.bellmanOperator()
# p 201, Proposition 6.6.5
max_iter = log( (epsilon * (1 - self.discount) / self.discount) / self.getSpan(self.value - Vprev) ) / log(self.discount * k)
self.value = Vprev
max_iter = log( (epsilon * (1 - self.discount) / self.discount) / getSpan(value - Vprev) ) / log(self.discount * k)
#self.value = Vprev
self.max_iter = ceil(max_iter)
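The bound computed above follows the proposition cited in the comment (p. 201, Proposition 6.6.5): the number of iterations needed is at most log((epsilon * (1 - discount) / discount) / sp(V1 - V0)) / log(discount * k). A numeric sketch with invented values standing in for k and the span:

from math import ceil, log

discount, epsilon = 0.9, 0.01
k = 0.1      # plays the role of 1 - h.sum() in the code above (invented value)
span = 2.0   # plays the role of getSpan(value - Vprev) (invented value)
max_iter = log((epsilon * (1 - discount) / discount) / span) / log(discount * k)
print(ceil(max_iter))   # upper bound on the iterations value iteration will need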
......@@ -1458,7 +1475,7 @@ class ValueIteration(MDP):
# The values, based on Q. For the function "max()": the option
# "axis" means the axis along which to operate. In this case it
# finds the maximum of each row, i.e. it operates along the columns.
variation = self.getSpan(self.value - Vprev)
variation = getSpan(self.value - Vprev)
if self.verbose:
print(" %s %s" % (self.iter, variation))
......@@ -1566,7 +1583,7 @@ class ValueIterationGS(ValueIteration):
Q[a] = self.R[s,a] + self.discount * self.P[a][s,:] * self.value
self.value[s] = max(Q)
variation = self.getSpan(V - Vprev)
variation = getSpan(V - Vprev)
if self.verbose:
print(" %s %s" % (self.iter, variation))
......
......@@ -172,10 +172,10 @@ R = array([[5, 10], [-1, 2]])
# MDP
def test_MDP_P_R_1():
P1 = zeros((ACTIONS, ), dtype=object)
P1[0] = matrix([[0.5, 0.5],[0.8, 0.2]])
P1[1] = matrix([[0, 1],[0.1, 0.9]])
R1 = matrix([[5, 10], [-1, 2]])
P1 = zeros((2, ), dtype=object)
P1[0] = matrix('0.5 0.5; 0.8 0.2')
P1[1] = matrix('0 1; 0.1 0.9')
R1 = matrix('5 10; -1 2')
a = MDP(P, R, 0.9, 0.01)
assert a.P.dtype == P1.dtype
assert a.R.dtype == R1.dtype
......@@ -185,10 +185,10 @@ def test_MDP_P_R_1():
def test_MDP_P_R_2():
R = array([[[5, 10], [-1, 2]], [[1, 2], [3, 4]]])
P1 = zeros((ACTIONS, ), dtype=object)
P1[0] = matrix([[0.5, 0.5],[0.8, 0.2]])
P1[1] = matrix([[0, 1],[0.1, 0.9]])
R1 = matrix([[7.5, 2], [-0.4, 3.9]])
P1 = zeros((2, ), dtype=object)
P1[0] = matrix('0.5 0.5; 0.8 0.2')
P1[1] = matrix('0 1; 0.1 0.9')
R1 = matrix('7.5 2; -0.4 3.9')
a = MDP(P, R, 0.9, 0.01)
assert type(a.P) == type(P1)
assert type(a.R) == type(R1)
......@@ -201,7 +201,7 @@ def test_MDP_P_R_2():
def test_MDP_P_R_3():
P = array([[[0.6116, 0.3884],[0, 1]],[[0.6674, 0.3326],[0, 1]]])
R = array([[[-0.2433, 0.7073],[0, 0.1871]],[[-0.0069, 0.6433],[0, 0.2898]]])
PR = matrix([[0.12591304, 0.20935652], [0.1871, 0.2898]])
PR = matrix('0.12591304 0.20935652; 0.1871 0.2898')
a = MDP(P, R, 0.9, 0.01)
assert (absolute(a.R - PR) < SMALLNUM).all()
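The expected values in this test match PR(s, a) = sum over s' of P(s, a, s') * R(s, a, s'); for example, state 0 under action 0 gives 0.6116 * -0.2433 + 0.3884 * 0.7073 = 0.12591304. A quick independent check with plain NumPy, separate from the toolbox's computePR, reusing the test's arrays:

from numpy import array, multiply

P = array([[[0.6116, 0.3884], [0, 1]], [[0.6674, 0.3326], [0, 1]]])
R = array([[[-0.2433, 0.7073], [0, 0.1871]], [[-0.0069, 0.6433], [0, 0.2898]]])
PR = multiply(P, R).sum(axis=2).T   # transpose gives the (S, A) layout asserted above
# PR[0, :] is close to [0.12591304, 0.20935652] and PR[1, :] to [0.1871, 0.2898]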
......@@ -229,16 +229,73 @@ def test_ValueIteration_exampleForest():
def test_PolicyIteration_init_policy0():
a = PolicyIteration(P, R, 0.9)
p = array((1, 1)).reshape(2, 1)
assert (absolute(a.policy - p) < SMALLNUM).all()
def test_PolicyIteration():
PolicyIteration(P, R, 0.9)
#inst.iterate()
#assert (abs(inst.value[0] - 42.4419) < 0.001)
#assert (abs(inst.value[1] - 36.0465) < 0.001)
#assert (inst.policy == (1, 0))
#assert (inst.iter == 2)
p = matrix('1; 1')
assert (a.policy == p).all()
def test_PolicyIteration_init_policy0_exampleForest():
P, R = exampleForest()
a = PolicyIteration(P, R, 0.9)
p = matrix('0; 1; 0')
assert (a.policy == p).all()
def test_PolicyIteration_computePpolicyPRpolicy_exampleForest():
P, R = exampleForest()
a = PolicyIteration(P, R, 0.9)
P1 = matrix('0.1 0.9 0; 1 0 0; 0.1 0 0.9')
R1 = matrix('0; 1; 4')
Ppolicy, Rpolicy = a.computePpolicyPRpolicy()
assert (absolute(Ppolicy - P1) < SMALLNUM).all()
assert (absolute(Rpolicy - R1) < SMALLNUM).all()
def test_PolicyIteration_evalPolicyIterative_exampleForest():
P, R = exampleForest()
v0 = matrix('0; 0; 0')
v1 = matrix('4.47504640074458; 5.02753258879703; 23.17234211944304')
p = matrix('0; 1; 0')
a = PolicyIteration(P, R, 0.9)
assert (absolute(a.value - v0) < SMALLNUM).all()
a.evalPolicyIterative()
assert (absolute(a.value - v1) < SMALLNUM).all()
assert (a.policy == p).all()
def test_PolicyIteration_evalPolicyIterative_bellmanOperator_exampleForest():
P, R = exampleForest()
v = matrix('4.47504640074458; 5.02753258879703; 23.17234211944304')
p = matrix('0; 0; 0')
a = PolicyIteration(P, R, 0.9)
a.evalPolicyIterative()
policy, value = a.bellmanOperator()
assert (policy == p).all()
assert (absolute(a.value - v) < SMALLNUM).all()
def test_PolicyIteration_iterative_exampleForest():
P, R = exampleForest()
a = PolicyIteration(P, R, 0.9, eval_type=1)
V = matrix('26.2439058351861 29.4839058351861 33.4839058351861')
p = matrix('0 0 0')
itr = 2
a.iterate()
assert (absolute(array(a.value) - V) < SMALLNUM).all()
assert (array(a.policy) == p).all()
assert a.iter == itr
def test_PolicyIteration_evalPolicyMatrix_exampleForest():
P, R = exampleForest()
v_pol = matrix('4.47513812154696; 5.02762430939227; 23.17243384704857')
a = PolicyIteration(P, R, 0.9)
a.evalPolicyMatrix()
assert (absolute(a.value - v_pol) < SMALLNUM).all()
def test_PolicyIteration_matrix_exampleForest():
P, R = exampleForest()
a = PolicyIteration(P, R, 0.9)
V = matrix('26.2440000000000 29.4840000000000 33.4840000000000')
p = matrix('0 0 0')
itr = 2
a.iterate()
assert (absolute(array(a.value) - V) < SMALLNUM).all()
assert (array(a.policy) == p).all()
assert a.iter == itr
#def test_JacksCarRental():
# S = 21 ** 2
......