Commit 6fd9a818 authored by Steven Cordwell

tidy up user facing messages

parent df7f9e55
@@ -66,6 +66,14 @@ from scipy.sparse import csr_matrix as sparse
 from .utils import check, getSpan
+MSG_STOP_MAX_ITER = "Iterating stopped due to maximum number of iterations " \
+    "condition."
+MSG_STOP_EPSILON_OPTIMAL_POLICY = "Iterating stopped, epsilon-optimal " \
+    "policy found."
+MSG_STOP_EPSILON_OPTIMAL_VALUE = "Iterating stopped, epsilon-optimal value " \
+    "function found."
+MSG_STOP_UNCHANGING_POLICY = "Iterating stopped, unchanging policy found."
 class MDP(object):
     """A Markov Decision Problem.
@@ -158,7 +166,7 @@ class MDP(object):
         assert 0.0 < self.discount <= 1.0, "Discount rate must be in ]0; 1]"
         if self.discount == 1:
             print("WARNING: check conditions of convergence. With no "
-                  "discount, convergence is can not be assumed.")
+                  "discount, convergence can not be assumed.")
         # if the max_iter is None then the algorithm is assumed to not use it
         # in its computations
         if max_iter is not None:
@@ -350,7 +358,7 @@ class FiniteHorizon(MDP):
     def __init__(self, transitions, reward, discount, N, h=None):
         # Initialise a finite horizon MDP.
         self.N = int(N)
-        assert self.N > 0, 'PyMDPtoolbox: N must be greater than 0.'
+        assert self.N > 0, "N must be greater than 0."
         # Initialise the base class
         MDP.__init__(self, transitions, reward, discount, None, None)
         # remove the iteration counter, it is not meaningful for backwards
@@ -373,23 +381,19 @@ class FiniteHorizon(MDP):
         # loop through each time period
         for n in range(self.N):
             W, X = self._bellmanOperator(self.V[:, self.N - n])
-            self.V[:, self.N - n - 1] = X
-            self.policy[:, self.N - n - 1] = W
+            stage = self.N - n - 1
+            self.V[:, stage] = X
+            self.policy[:, stage] = W
             if self.verbose:
-                print(("stage: %s ... policy transpose : %s") % (
-                    self.N - n, self.policy[:, self.N - n -1].tolist()))
+                print(("stage: %s, policy: %s") % (
+                    stage, self.policy[:, stage].tolist()))
         # update time spent running
         self.time = time() - self.time
         # After this we could create a tuple of tuples for the values and
         # policies.
-        #V = []
-        #p = []
-        #for n in xrange(self.N):
-        #    V.append()
-        #    p.append()
-        #V.append()
-        #self.V = tuple(V)
-        #self.policy = tuple(p)
+        #self.V = tuple(tuple(self.V[:, n].tolist()) for n in range(self.N))
+        #self.policy = tuple(tuple(self.policy[:, n].tolist())
+        #    for n in range(self.N))
 
 class LP(MDP):
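The new `stage` variable makes the backwards induction explicit. A small sketch of the index order, using a made-up horizon (N here is hypothetical, not taken from the commit):

    # With N = 4 the loop visits stages 3, 2, 1, 0, so each V[:, stage] is
    # computed from V[:, stage + 1] by the Bellman operator, working back
    # from the final stage to the first.
    N = 4
    for n in range(N):
        stage = N - n - 1
        print(n, stage)   # (0, 3), (1, 2), (2, 1), (3, 0)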
@@ -420,10 +424,6 @@ class LP(MDP):
     time : float
         used CPU time
 
-    Notes
-    -----
-    In verbose mode, displays the current stage and policy transpose.
-
     Examples
     --------
     >>> import mdptoolbox, mdptoolbox.example
@@ -585,10 +585,9 @@ class PolicyIteration(MDP):
         elif eval_type in (1, "iterative"):
             self.eval_type = "iterative"
         else:
-            raise ValueError("PyMDPtoolbox: eval_type should be 0 for matrix "
-                             "evaluation or 1 for iterative evaluation. "
-                             "The strings 'matrix' and 'iterative' can also "
-                             "be used.")
+            raise ValueError("'eval_type' should be '0' for matrix evaluation "
+                             "or '1' for iterative evaluation. The strings "
+                             "'matrix' and 'iterative' can also be used.")
         # Call the iteration method
         #self.run()
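For reference, the four accepted forms of `eval_type` that the reworded error message refers to. The constructor calls below are a hypothetical usage sketch; P, R and the discount are placeholders:

    pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.9, eval_type=0)            # matrix evaluation
    pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.9, eval_type="matrix")     # same as 0
    pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.9, eval_type=1)            # iterative evaluation
    pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.9, eval_type="iterative")  # same as 1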
...@@ -673,15 +672,15 @@ class PolicyIteration(MDP): ...@@ -673,15 +672,15 @@ class PolicyIteration(MDP):
"'V0' must be a vector of length S." "'V0' must be a vector of length S."
policy_V = array(V0).reshape(self.S) policy_V = array(V0).reshape(self.S)
except AttributeError: except AttributeError:
if len(V0) == self.S: if V0 == 0:
policy_V = array(V0).reshape(self.S)
else:
policy_V = zeros(self.S) policy_V = zeros(self.S)
else:
policy_V = array(V0).reshape(self.S)
policy_P, policy_R = self._computePpolicyPRpolicy() policy_P, policy_R = self._computePpolicyPRpolicy()
if self.verbose: if self.verbose:
print(' Iteration V_variation') print(' Iteration\t\t V variation')
itr = 0 itr = 0
done = False done = False
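The rewritten fallback branch changes how a non-array `V0` is handled: `V0 == 0` selects the zero starting vector, while any other length-S sequence is converted and used directly. A tiny sketch with made-up values:

    V0 = 0                 # -> policy evaluation starts from zeros(S)
    V0 = [1.0, 0.5, 0.0]   # -> a plain length-S sequence is reshaped and used as the start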
@@ -693,19 +692,17 @@ class PolicyIteration(MDP):
             variation = absolute(policy_V - Vprev).max()
             if self.verbose:
-                print((' %s %s') % (itr, variation))
+                print((' %s\t\t %s') % (itr, variation))
             # ensure |Vn - Vpolicy| < epsilon
             if variation < ((1 - self.discount) / self.discount) * epsilon:
                 done = True
                 if self.verbose:
-                    print("PyMDPtoolbox: iterations stopped, epsilon-optimal "
-                          "value function.")
+                    print(MSG_STOP_EPSILON_OPTIMAL_VALUE)
             elif itr == max_iter:
                 done = True
                 if self.verbose:
-                    print("PyMDPtoolbox: iterations stopped by maximum number "
-                          "of iteration condition.")
+                    print(MSG_STOP_MAX_ITER)
         self.V = policy_V
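The stopping test above compares the largest per-state change in V against a discount-scaled tolerance, so that the evaluation ends within epsilon of the true value of the policy (the `|Vn - Vpolicy| < epsilon` comment). A quick worked example with made-up numbers:

    # discount = 0.9, epsilon = 0.01  ->  threshold = (1 - 0.9) / 0.9 * 0.01
    discount, epsilon = 0.9, 0.01
    thresh = ((1 - discount) / discount) * epsilon
    print(thresh)   # ~0.00111; iteration stops once max|V_n - V_{n-1}| falls below this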
...@@ -738,7 +735,7 @@ class PolicyIteration(MDP): ...@@ -738,7 +735,7 @@ class PolicyIteration(MDP):
# Run the policy iteration algorithm. # Run the policy iteration algorithm.
# If verbose the print a header # If verbose the print a header
if self.verbose: if self.verbose:
print(' Iteration Number_of_different_actions') print(' Iteration\t\tNumber of different actions')
# Set up the while stopping condition and the current time # Set up the while stopping condition and the current time
done = False done = False
self.time = time() self.time = time()
@@ -760,20 +757,17 @@ class PolicyIteration(MDP):
             n_different = (policy_next != self.policy).sum()
             # if verbose then continue printing a table
             if self.verbose:
-                print((' %s %s') % (self.iter,
-                                    n_different))
+                print((' %s\t\t %s') % (self.iter, n_different))
             # Once the policy is unchanging of the maximum number of
             # of iterations has been reached then stop
             if n_different == 0:
                 done = True
                 if self.verbose:
-                    print("PyMDPtoolbox: iterations stopped, unchanging "
-                          "policy found.")
+                    print(MSG_STOP_UNCHANGING_POLICY)
             elif (self.iter == self.max_iter):
                 done = True
                 if self.verbose:
-                    print("PyMDPtoolbox: iterations stopped by maximum number "
-                          "of iteration condition.")
+                    print(MSG_STOP_MAX_ITER)
             else:
                 self.policy = policy_next
         # update the time to return th computation time
@@ -865,7 +859,7 @@ class PolicyIterationModified(PolicyIteration):
         # Run the modified policy iteration algorithm.
         if self.verbose:
-            print('\tIteration\tV-variation')
+            print(' \tIteration\t\tV-variation')
         self.time = time()
@@ -878,7 +872,7 @@ class PolicyIterationModified(PolicyIteration):
             variation = getSpan(Vnext - self.V)
             if self.verbose:
-                print(("\t%s\t%s" % (self.iter, variation)))
+                print((" %s\t\t %s" % (self.iter, variation)))
             self.V = Vnext
             if variation < self.thresh:
@@ -973,8 +967,7 @@ class QLearning(MDP):
         # The following check won't be done in MDP()'s initialisation, so let's
         # do it here
         self.max_iter = int(n_iter)
-        assert self.max_iter >= 10000, "PyMDPtoolbox: n_iter should be " \
-            "greater than 10000."
+        assert self.max_iter >= 10000, "'n_iter' should be greater than 10000."
         # We don't want to send this to MDP because _computePR should not be
         # run on it, so check that it defines an MDP
@@ -1029,7 +1022,10 @@ class QLearning(MDP):
             try:
                 r = self.R[a][s, s_new]
             except IndexError:
-                r = self.R[s, a]
+                try:
+                    r = self.R[s, a]
+                except IndexError:
+                    r = self.R[s]
             # Updating the value of Q
             # Decaying update coefficient (1/sqrt(n+2)) can be changed
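The nested try/except lets the reward lookup fall back through progressively simpler reward layouts. A sketch of the three shapes it can now index; the names and sizes are invented for illustration only:

    import numpy as np

    S, A = 3, 2
    R_per_action = [np.ones((S, S)) for _ in range(A)]   # r = R[a][s, s_new]
    R_state_action = np.ones((S, A))                     # r = R[s, a]
    R_state_only = np.ones(S)                            # r = R[s]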
...@@ -1143,7 +1139,7 @@ class RelativeValueIteration(MDP): ...@@ -1143,7 +1139,7 @@ class RelativeValueIteration(MDP):
done = False done = False
if self.verbose: if self.verbose:
print(' Iteration U_variation') print(' Iteration\t\tU variation')
self.time = time() self.time = time()
@@ -1157,20 +1153,18 @@ class RelativeValueIteration(MDP):
             variation = getSpan(Vnext - self.V)
             if self.verbose:
-                print((" %s %s" % (self.iter, variation)))
+                print((" %s\t\t %s" % (self.iter, variation)))
             if variation < self.epsilon:
                 done = True
                 self.average_reward = self.gain + (Vnext - self.V).min()
                 if self.verbose:
-                    print("MDP Toolbox : iterations stopped, epsilon-optimal "
-                          "policy found.")
+                    print(MSG_STOP_EPSILON_OPTIMAL_POLICY)
             elif self.iter == self.max_iter:
                 done = True
                 self.average_reward = self.gain + (Vnext - self.V).min()
                 if self.verbose:
-                    print("MDP Toolbox : iterations stopped by maximum "
-                          "number of iteration condition.")
+                    print(MSG_STOP_MAX_ITER)
             self.V = Vnext
             self.gain = float(self.V[self.S - 1])
@@ -1270,33 +1264,33 @@ class ValueIteration(MDP):
     >>> vi.setVerbose()
     >>> vi.run()
     Iteration V-variation
     1 8.0
     2 2.76
     3 1.9872
     4 1.430784
     5 1.03016448
     6 0.7417184256
     7 0.534037266432
     8 0.384506831831
     9 0.276844918918
     10 0.199328341621
     11 0.143516405967
     12 0.103331812296
     13 0.0743989048534
     14 0.0535672114945
     15 0.038568392276
     16 0.0277692424387
     17 0.0199938545559
     18 0.0143955752802
     19 0.0103648142018
     20 0.00746266622526
     21 0.00537311968218
     22 0.00386864617116
     23 0.00278542524322
     24 0.00200550617512
     25 0.00144396444609
     26 0.0010396544012
-    PyMDPToolbox: iteration stopped, epsilon-optimal policy found.
+    Iterating stopped, epsilon-optimal policy found.
     >>> vi.V
     (40.048625392716815, 33.65371175967546)
     >>> vi.policy
@@ -1396,7 +1390,7 @@ class ValueIteration(MDP):
         # Run the value iteration algorithm.
         if self.verbose:
-            print('\tIteration\tV-variation')
+            print(' Iteration\t\tV-variation')
         self.time = time()
         while True:
@@ -1413,16 +1407,15 @@ class ValueIteration(MDP):
             variation = getSpan(self.V - Vprev)
             if self.verbose:
-                print(("\t%s\t%s" % (self.iter, variation)))
+                print((" %s\t\t %s" % (self.iter, variation)))
             if variation < self.thresh:
                 if self.verbose:
-                    print("Iteration stopped, epsilon-optimal policy found.")
+                    print(MSG_STOP_EPSILON_OPTIMAL_POLICY)
                 break
             elif (self.iter == self.max_iter):
                 if self.verbose:
-                    print("Iteration stopped by maximum number of iterations "
-                          "condition.")
+                    print(MSG_STOP_MAX_ITER)
                 break
         # store value and policy as tuples
...@@ -1524,7 +1517,7 @@ class ValueIterationGS(ValueIteration): ...@@ -1524,7 +1517,7 @@ class ValueIterationGS(ValueIteration):
done = False done = False
if self.verbose: if self.verbose:
print(' Iteration V_variation') print(' Iteration\t\tV-variation')
self.time = time() self.time = time()
@@ -1543,18 +1536,16 @@ class ValueIterationGS(ValueIteration):
             variation = getSpan(self.V - Vprev)
             if self.verbose:
-                print((" %s %s" % (self.iter, variation)))
+                print((" %s\t\t %s" % (self.iter, variation)))
             if variation < self.thresh:
                 done = True
                 if self.verbose:
-                    print("Iterations stopped, epsilon-optimal policy found.")
+                    print(MSG_STOP_EPSILON_OPTIMAL_POLICY)
             elif self.iter == self.max_iter:
                 done = True
                 if self.verbose:
-                    print("Iterations stopped by maximum number of iteration "
-                          "condition.")
+                    print(MSG_STOP_MAX_ITER)
         self.policy = []
         for s in range(self.S):