Commit 6fd9a818 authored by Steven Cordwell

tidy up user facing messages

parent df7f9e55
@@ -66,6 +66,14 @@ from scipy.sparse import csr_matrix as sparse
from .utils import check, getSpan
MSG_STOP_MAX_ITER = "Iterating stopped due to maximum number of iterations " \
"condition."
MSG_STOP_EPSILON_OPTIMAL_POLICY = "Iterating stopped, epsilon-optimal " \
"policy found."
MSG_STOP_EPSILON_OPTIMAL_VALUE = "Iterating stopped, epsilon-optimal value " \
"function found."
MSG_STOP_UNCHANGING_POLICY = "Iterating stopped, unchanging policy found."
class MDP(object):
"""A Markov Decision Problem.
@@ -158,7 +166,7 @@ class MDP(object):
assert 0.0 < self.discount <= 1.0, "Discount rate must be in ]0; 1]"
if self.discount == 1:
print("WARNING: check conditions of convergence. With no "
"discount, convergence is can not be assumed.")
"discount, convergence can not be assumed.")
# if the max_iter is None then the algorithm is assumed to not use it
# in its computations
if max_iter is not None:
@@ -350,7 +358,7 @@ class FiniteHorizon(MDP):
def __init__(self, transitions, reward, discount, N, h=None):
# Initialise a finite horizon MDP.
self.N = int(N)
assert self.N > 0, 'PyMDPtoolbox: N must be greater than 0.'
assert self.N > 0, "N must be greater than 0."
# Initialise the base class
MDP.__init__(self, transitions, reward, discount, None, None)
# remove the iteration counter, it is not meaningful for backwards
@@ -373,23 +381,19 @@ class FiniteHorizon(MDP):
# loop through each time period
for n in range(self.N):
W, X = self._bellmanOperator(self.V[:, self.N - n])
self.V[:, self.N - n - 1] = X
self.policy[:, self.N - n - 1] = W
stage = self.N - n - 1
self.V[:, stage] = X
self.policy[:, stage] = W
if self.verbose:
print(("stage: %s ... policy transpose : %s") % (
self.N - n, self.policy[:, self.N - n -1].tolist()))
print(("stage: %s, policy: %s") % (
stage, self.policy[:, stage].tolist()))
# update time spent running
self.time = time() - self.time
# After this we could create a tuple of tuples for the values and
# policies.
#V = []
#p = []
#for n in xrange(self.N):
# V.append()
# p.append()
#V.append()
#self.V = tuple(V)
#self.policy = tuple(p)
#self.V = tuple(tuple(self.V[:, n].tolist()) for n in range(self.N))
#self.policy = tuple(tuple(self.policy[:, n].tolist())
# for n in range(self.N))
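The new stage variable makes the backward recursion easier to follow: V is an S x (N+1) array whose last column holds the terminal values, and each pass writes column N - n - 1 from column N - n. A minimal sketch of just that indexing, with invented sizes and the Bellman backup stubbed out:

    import numpy as np

    S, N = 3, 4                      # invented sizes, for illustration only
    V = np.zeros((S, N + 1))         # column N holds the terminal values h
    policy = np.zeros((S, N), dtype=int)

    for n in range(N):
        stage = N - n - 1            # stages are filled N-1, N-2, ..., 0
        # The real class computes W, X = self._bellmanOperator(V[:, stage + 1]);
        # copying the next column stands in for that backup here.
        V[:, stage] = V[:, stage + 1]
        policy[:, stage] = 0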
class LP(MDP):
@@ -420,10 +424,6 @@ class LP(MDP):
time : float
used CPU time
Notes
-----
In verbose mode, displays the current stage and policy transpose.
Examples
--------
>>> import mdptoolbox, mdptoolbox.example
@@ -585,10 +585,9 @@ class PolicyIteration(MDP):
elif eval_type in (1, "iterative"):
self.eval_type = "iterative"
else:
raise ValueError("PyMDPtoolbox: eval_type should be 0 for matrix "
"evaluation or 1 for iterative evaluation. "
"The strings 'matrix' and 'iterative' can also "
"be used.")
raise ValueError("'eval_type' should be '0' for matrix evaluation "
"or '1' for iterative evaluation. The strings "
"'matrix' and 'iterative' can also be used.")
# Call the iteration method
#self.run()
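For reference, both spellings of eval_type that the reworded message describes are accepted. A hedged usage sketch, with the forest example following the doctests used elsewhere in this file:

    import mdptoolbox, mdptoolbox.example

    P, R = mdptoolbox.example.forest()
    # 0 and "matrix" select matrix policy evaluation; 1 and "iterative" select
    # the iterative evaluation used by _evalPolicyIterative below.
    pi_matrix = mdptoolbox.mdp.PolicyIteration(P, R, 0.9, eval_type="matrix")
    pi_iterative = mdptoolbox.mdp.PolicyIteration(P, R, 0.9, eval_type=1)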
@@ -673,15 +672,15 @@ class PolicyIteration(MDP):
"'V0' must be a vector of length S."
policy_V = array(V0).reshape(self.S)
except AttributeError:
if len(V0) == self.S:
policy_V = array(V0).reshape(self.S)
else:
if V0 == 0:
policy_V = zeros(self.S)
else:
policy_V = array(V0).reshape(self.S)
policy_P, policy_R = self._computePpolicyPRpolicy()
if self.verbose:
print(' Iteration V_variation')
print(' Iteration\t\t V variation')
itr = 0
done = False
@@ -693,19 +692,17 @@ class PolicyIteration(MDP):
variation = absolute(policy_V - Vprev).max()
if self.verbose:
print((' %s %s') % (itr, variation))
print((' %s\t\t %s') % (itr, variation))
# ensure |Vn - Vpolicy| < epsilon
if variation < ((1 - self.discount) / self.discount) * epsilon:
done = True
if self.verbose:
print("PyMDPtoolbox: iterations stopped, epsilon-optimal "
"value function.")
print(MSG_STOP_EPSILON_OPTIMAL_VALUE)
elif itr == max_iter:
done = True
if self.verbose:
print("PyMDPtoolbox: iterations stopped by maximum number "
"of iteration condition.")
print(MSG_STOP_MAX_ITER)
self.V = policy_V
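The stopping test above follows the usual contraction argument: once successive evaluates differ by less than ((1 - discount) / discount) * epsilon, the current iterate is within epsilon of the true value of the policy. A small numeric sketch of that threshold, with invented values and assuming the new constant is importable from mdptoolbox.mdp once this change lands:

    from mdptoolbox.mdp import MSG_STOP_EPSILON_OPTIMAL_VALUE

    discount, epsilon = 0.96, 0.01
    thresh = ((1 - discount) / discount) * epsilon   # approx 0.000417
    variation = 0.0003                               # invented max |V_n - Vprev| for one sweep
    if variation < thresh:                           # same comparison as the hunk above
        print(MSG_STOP_EPSILON_OPTIMAL_VALUE)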
@@ -738,7 +735,7 @@ class PolicyIteration(MDP):
# Run the policy iteration algorithm.
# If verbose then print a header
if self.verbose:
print(' Iteration Number_of_different_actions')
print(' Iteration\t\tNumber of different actions')
# Set up the while stopping condition and the current time
done = False
self.time = time()
@@ -760,20 +757,17 @@ class PolicyIteration(MDP):
n_different = (policy_next != self.policy).sum()
# if verbose then continue printing a table
if self.verbose:
print((' %s %s') % (self.iter,
n_different))
print((' %s\t\t %s') % (self.iter, n_different))
# Once the policy is unchanging or the maximum number of
# iterations has been reached then stop
if n_different == 0:
done = True
if self.verbose:
print("PyMDPtoolbox: iterations stopped, unchanging "
"policy found.")
print(MSG_STOP_UNCHANGING_POLICY)
elif (self.iter == self.max_iter):
done = True
if self.verbose:
print("PyMDPtoolbox: iterations stopped by maximum number "
"of iteration condition.")
print(MSG_STOP_MAX_ITER)
else:
self.policy = policy_next
# update the time to return the computation time
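n_different in the hunk above counts how many states changed action between sweeps, and the loop ends when it reaches zero. A tiny standalone illustration of that comparison, with invented policies:

    import numpy as np

    policy = np.array([0, 1, 1, 0])
    policy_next = np.array([0, 1, 0, 0])
    n_different = (policy_next != policy).sum()   # 1 state changed action, so keep iterating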
@@ -865,7 +859,7 @@ class PolicyIterationModified(PolicyIteration):
# Run the modified policy iteration algorithm.
if self.verbose:
print('\tIteration\tV-variation')
print(' \tIteration\t\tV-variation')
self.time = time()
@@ -878,7 +872,7 @@ class PolicyIterationModified(PolicyIteration):
variation = getSpan(Vnext - self.V)
if self.verbose:
print(("\t%s\t%s" % (self.iter, variation)))
print((" %s\t\t %s" % (self.iter, variation)))
self.V = Vnext
if variation < self.thresh:
@@ -973,8 +967,7 @@ class QLearning(MDP):
# The following check won't be done in MDP()'s initialisation, so let's
# do it here
self.max_iter = int(n_iter)
assert self.max_iter >= 10000, "PyMDPtoolbox: n_iter should be " \
"greater than 10000."
assert self.max_iter >= 10000, "'n_iter' should be greater than 10000."
# We don't want to send this to MDP because _computePR should not be
# run on it, so check that it defines an MDP
@@ -1029,7 +1022,10 @@ class QLearning(MDP):
try:
r = self.R[a][s, s_new]
except IndexError:
r = self.R[s, a]
try:
r = self.R[s, a]
except IndexError:
r = self.R[s]
# Updating the value of Q
# Decaying update coefficient (1/sqrt(n+2)) can be changed
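The nested try/except added here lets the learning loop read a reward whichever layout R was given in: per-action S x S matrices indexed as R[a][s, s_new], an S x A array indexed as R[s, a], or a length-S vector indexed as R[s]. A hedged sketch of the three layouts and of the decaying update the comment above refers to (shapes and values invented; the update is the standard tabular rule, not copied from this file):

    import numpy as np
    from math import sqrt

    S, A = 3, 2
    R_per_action = [np.random.rand(S, S) for _ in range(A)]   # read as R[a][s, s_new]
    R_state_action = np.random.rand(S, A)                     # read as R[s, a]
    R_state = np.random.rand(S)                               # read as R[s]

    # Standard tabular Q-learning step with the 1/sqrt(n + 2) coefficient
    # that the comment above refers to.
    Q = np.zeros((S, A))
    s, a, s_new, r, discount, n = 0, 1, 2, 1.0, 0.9, 0
    delta = r + discount * Q[s_new, :].max() - Q[s, a]
    Q[s, a] += (1 / sqrt(n + 2)) * delta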
@@ -1143,7 +1139,7 @@ class RelativeValueIteration(MDP):
done = False
if self.verbose:
print(' Iteration U_variation')
print(' Iteration\t\tU variation')
self.time = time()
@@ -1157,20 +1153,18 @@ class RelativeValueIteration(MDP):
variation = getSpan(Vnext - self.V)
if self.verbose:
print((" %s %s" % (self.iter, variation)))
print((" %s\t\t %s" % (self.iter, variation)))
if variation < self.epsilon:
done = True
self.average_reward = self.gain + (Vnext - self.V).min()
if self.verbose:
print("MDP Toolbox : iterations stopped, epsilon-optimal "
"policy found.")
print(MSG_STOP_EPSILON_OPTIMAL_POLICY)
elif self.iter == self.max_iter:
done = True
self.average_reward = self.gain + (Vnext - self.V).min()
if self.verbose:
print("MDP Toolbox : iterations stopped by maximum "
"number of iteration condition.")
print(MSG_STOP_MAX_ITER)
self.V = Vnext
self.gain = float(self.V[self.S - 1])
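Relative value iteration (and the value-iteration classes below) stop on the span of the change in V rather than on a norm; getSpan, imported from .utils at the top of this diff, is in effect max minus min. A one-line equivalent, assuming that definition:

    import numpy as np

    def get_span(W):
        # span semi-norm used by the stopping tests above: max(W) - min(W)
        return W.max() - W.min()

    print(get_span(np.array([1.0, 4.0, 2.5])))   # 3.0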
@@ -1270,33 +1264,33 @@ class ValueIteration(MDP):
>>> vi.setVerbose()
>>> vi.run()
Iteration V-variation
1 8.0
2 2.76
3 1.9872
4 1.430784
5 1.03016448
6 0.7417184256
7 0.534037266432
8 0.384506831831
9 0.276844918918
10 0.199328341621
11 0.143516405967
12 0.103331812296
13 0.0743989048534
14 0.0535672114945
15 0.038568392276
16 0.0277692424387
17 0.0199938545559
18 0.0143955752802
19 0.0103648142018
20 0.00746266622526
21 0.00537311968218
22 0.00386864617116
23 0.00278542524322
24 0.00200550617512
25 0.00144396444609
26 0.0010396544012
PyMDPToolbox: iteration stopped, epsilon-optimal policy found.
1 8.0
2 2.76
3 1.9872
4 1.430784
5 1.03016448
6 0.7417184256
7 0.534037266432
8 0.384506831831
9 0.276844918918
10 0.199328341621
11 0.143516405967
12 0.103331812296
13 0.0743989048534
14 0.0535672114945
15 0.038568392276
16 0.0277692424387
17 0.0199938545559
18 0.0143955752802
19 0.0103648142018
20 0.00746266622526
21 0.00537311968218
22 0.00386864617116
23 0.00278542524322
24 0.00200550617512
25 0.00144396444609
26 0.0010396544012
Iterating stopped, epsilon-optimal policy found.
>>> vi.V
(40.048625392716815, 33.65371175967546)
>>> vi.policy
@@ -1396,7 +1390,7 @@ class ValueIteration(MDP):
# Run the value iteration algorithm.
if self.verbose:
print('\tIteration\tV-variation')
print(' Iteration\t\tV-variation')
self.time = time()
while True:
@@ -1413,16 +1407,15 @@ class ValueIteration(MDP):
variation = getSpan(self.V - Vprev)
if self.verbose:
print(("\t%s\t%s" % (self.iter, variation)))
print((" %s\t\t %s" % (self.iter, variation)))
if variation < self.thresh:
if self.verbose:
print("Iteration stopped, epsilon-optimal policy found.")
print(MSG_STOP_EPSILON_OPTIMAL_POLICY)
break
elif (self.iter == self.max_iter):
if self.verbose:
print("Iteration stopped by maximum number of iterations "
"condition.")
print(MSG_STOP_MAX_ITER)
break
# store value and policy as tuples
@@ -1524,7 +1517,7 @@ class ValueIterationGS(ValueIteration):
done = False
if self.verbose:
print(' Iteration V_variation')
print(' Iteration\t\tV-variation')
self.time = time()
@@ -1543,18 +1536,16 @@ class ValueIterationGS(ValueIteration):
variation = getSpan(self.V - Vprev)
if self.verbose:
print((" %s %s" % (self.iter, variation)))
print((" %s\t\t %s" % (self.iter, variation)))
if variation < self.thresh:
done = True
if self.verbose:
print("Iterations stopped, epsilon-optimal policy found.")
print(MSG_STOP_EPSILON_OPTIMAL_POLICY)
elif self.iter == self.max_iter:
done = True
if self.verbose:
print("Iterations stopped by maximum number of iteration "
"condition.")
print(MSG_STOP_MAX_ITER)
self.policy = []
for s in range(self.S):
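ValueIterationGS is the Gauss-Seidel variant of value iteration: each state's new value is written back into V immediately, so later states in the same sweep already use it. A minimal sketch of one such sweep under that assumption (P as a list of per-action transition matrices and R as an S x A array are illustrative layouts, not taken from this diff):

    import numpy as np

    def gauss_seidel_sweep(P, R, V, discount):
        # P: list of S x S transition matrices (one per action); R: S x A rewards.
        S, A = R.shape
        for s in range(S):
            # V already holds this sweep's updates for states < s, so the backup
            # below mixes new and old values, which is the Gauss-Seidel idea.
            V[s] = max(R[s, a] + discount * P[a][s, :].dot(V) for a in range(A))
        return V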