Commit 299b4b4d authored by Steven Cordwell

[import] Add underscore before imported modules

Import modules with an underscore in front of their names, for example
import numpy as _np. This addresses the module namespace getting too full.
parent b9d05b75
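As context for the change, here is a minimal illustrative sketch of the convention the commit adopts (the module name example_module.py and the helper make_sparse_identity are made up for illustration and are not part of this commit): binding an import to an underscore-prefixed alias marks it as private, so "from example_module import *" skips it and interactive tab completion typically hides it, which keeps the importing module's public namespace from filling up with third-party names.

    # example_module.py -- illustrative sketch only, not a file in this commit
    import numpy as _np            # private alias, excluded from "import *"
    import scipy.sparse as _sp     # same treatment for scipy.sparse

    def make_sparse_identity(n):
        """Return an n x n sparse identity matrix; only this name is public."""
        return _sp.csr_matrix(_np.eye(n))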
@@ -43,9 +43,8 @@ rand
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
-from numpy import diag, ones, where, zeros
-from numpy.random import randint, random
-from scipy.sparse import coo_matrix, dok_matrix
+import numpy as _np
+import scipy.sparse as _sp
 def forest(S=3, r1=4, r2=2, p=0.1, is_sparse=False):
     """Generate a MDP example based on a simple forest management scenario.
@@ -165,22 +164,22 @@ def forest(S=3, r1=4, r2=2, p=0.1, is_sparse=False):
         rows = list(range(S)) * 2
         cols = [0] * S + list(range(1, S)) + [S - 1]
         vals = [p] * S + [1-p] * S
-        P.append(coo_matrix((vals, (rows, cols)), shape=(S,S)).tocsr())
+        P.append(_sp.coo_matrix((vals, (rows, cols)), shape=(S,S)).tocsr())
         rows = list(range(S))
         cols = [0] * S
         vals = [1] * S
-        P.append(coo_matrix((vals, (rows, cols)), shape=(S,S)).tocsr())
+        P.append(_sp.coo_matrix((vals, (rows, cols)), shape=(S,S)).tocsr())
     else:
-        P = zeros((2, S, S))
-        P[0, :, :] = (1 - p) * diag(ones(S - 1), 1)
+        P = _np.zeros((2, S, S))
+        P[0, :, :] = (1 - p) * _np.diag(_np.ones(S - 1), 1)
         P[0, :, 0] = p
         P[0, S - 1, S - 1] = (1 - p)
-        P[1, :, :] = zeros((S, S))
+        P[1, :, :] = _np.zeros((S, S))
         P[1, :, 0] = 1
     # Definition of Reward matrix
-    R = zeros((S, 2))
+    R = _np.zeros((S, 2))
     R[S - 1, 0] = r1
-    R[:, 1] = ones(S)
+    R[:, 1] = _np.ones(S)
     R[0, 1] = 0
     R[S - 1, 1] = r2
     # we want to return the generated transition and reward matrices
@@ -284,11 +283,11 @@ def rand(S, A, is_sparse=False, mask=None):
             # it may be more efficient to implement this by constructing lists
             # of rows, columns and values then creating a coo_matrix, but this
             # works for now
-            PP = dok_matrix((S, S))
-            RR = dok_matrix((S, S))
+            PP = _sp.dok_matrix((S, S))
+            RR = _sp.dok_matrix((S, S))
             for s in range(S):
                 if mask is None:
-                    m = random(S)
+                    m = _np.random.random(S)
                     m[m <= 2/3.0] = 0
                     m[m > 2/3.0] = 1
                 elif mask.shape == (A, S, S):
@@ -297,7 +296,7 @@ def rand(S, A, is_sparse=False, mask=None):
                     m = mask[s]
                 n = int(m.sum()) # m[s, :]
                 if n == 0:
-                    m[randint(0, S)] = 1
+                    m[_np.random.randint(0, S)] = 1
                     n = 1
                 # find the columns of the vector that have non-zero elements
                 nz = m.nonzero()
@@ -305,9 +304,9 @@ def rand(S, A, is_sparse=False, mask=None):
                     cols = nz[0]
                 else:
                     cols = nz[1]
-                vals = random(n)
+                vals = _np.random.random(n)
                 vals = vals / vals.sum()
-                reward = 2*random(n) - ones(n)
+                reward = 2*_np.random.random(n) - _np.ones(n)
                 PP[s, cols] = vals
                 RR[s, cols] = reward
             # PP.tocsr() takes the same amount of time as PP.tocoo().tocsr()
@@ -317,15 +316,15 @@ def rand(S, A, is_sparse=False, mask=None):
             R[a] = RR.tocsr()
     else:
         # definition of transition matrix : square stochastic matrix
-        P = zeros((A, S, S))
+        P = _np.zeros((A, S, S))
         # definition of reward matrix (values between -1 and +1)
-        R = zeros((A, S, S))
+        R = _np.zeros((A, S, S))
         for a in range(A):
             for s in range(S):
                 # create our own random mask if there is no user supplied one
                 if mask is None:
-                    m = random(S)
-                    r = random()
+                    m = _np.random.random(S)
+                    r = _np.random.random()
                     m[m <= r] = 0
                     m[m > r] = 1
                 elif mask.shape == (A, S, S):
@@ -334,9 +333,10 @@ def rand(S, A, is_sparse=False, mask=None):
                     m = mask[s]
                 # Make sure that there is atleast one transition in each state
                 if m.sum() == 0:
-                    m[randint(0, S)] = 1
-                P[a][s] = m * random(S)
+                    m[_np.random.randint(0, S)] = 1
+                P[a][s] = m * _np.random.random(S)
                 P[a][s] = P[a][s] / P[a][s].sum()
-                R[a][s] = (m * (2*random(S) - ones(S, dtype=int)))
+                R[a][s] = (m * (2 * _np.random.random(S) -
+                                _np.ones(S, dtype=int)))
     # we want to return the generated transition and reward matrices
     return(P, R)
@@ -57,12 +57,11 @@ ValueIterationGS
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
-from math import ceil, log, sqrt
-from time import time
-from numpy import absolute, array, empty, mean, mod, multiply, ones, zeros
-from numpy.random import randint, random
-from scipy.sparse import csr_matrix as sparse
+import math as _math
+import time as _time
+import numpy as _np
+import scipy.sparse as _sp
 try:
     from .util import check, getSpan
@@ -70,13 +69,13 @@ except ValueError:
     # importing mdp as a module rather than as part of a package
     from util import check, getSpan
-MSG_STOP_MAX_ITER = "Iterating stopped due to maximum number of iterations " \
+_MSG_STOP_MAX_ITER = "Iterating stopped due to maximum number of iterations " \
     "condition."
-MSG_STOP_EPSILON_OPTIMAL_POLICY = "Iterating stopped, epsilon-optimal " \
+_MSG_STOP_EPSILON_OPTIMAL_POLICY = "Iterating stopped, epsilon-optimal " \
     "policy found."
-MSG_STOP_EPSILON_OPTIMAL_VALUE = "Iterating stopped, epsilon-optimal value " \
+_MSG_STOP_EPSILON_OPTIMAL_VALUE = "Iterating stopped, epsilon-optimal value " \
     "function found."
-MSG_STOP_UNCHANGING_POLICY = "Iterating stopped, unchanging policy found."
+_MSG_STOP_UNCHANGING_POLICY = "Iterating stopped, unchanging policy found."
 class MDP(object):
@@ -228,7 +227,7 @@ class MDP(object):
         # P and V can be any object that supports indexing, so it is important
         # that you know they define a valid MDP before calling the
         # _bellmanOperator method. Otherwise the results will be meaningless.
-        Q = empty((self.A, self.S))
+        Q = _np.empty((self.A, self.S))
         for aa in range(self.A):
             Q[aa] = self.R[aa] + self.discount * self.P[aa].dot(V)
         # Get the policy and value, for now it is being returned but...
@@ -278,20 +277,20 @@ class MDP(object):
         # vector.
         try:
             if R.ndim == 1:
-                r = array(R).reshape(self.S)
+                r = _np.array(R).reshape(self.S)
                 self.R = tuple(r for aa in range(self.A))
             elif R.ndim == 2:
-                self.R = tuple(array(R[:, aa]).reshape(self.S)
+                self.R = tuple(_np.array(R[:, aa]).reshape(self.S)
                                for aa in range(self.A))
             else:
-                self.R = tuple(multiply(P[aa], R[aa]).sum(1).reshape(self.S)
+                self.R = tuple(_np.multiply(P[aa], R[aa]).sum(1).reshape(self.S)
                                for aa in range(self.A))
         except AttributeError:
             if len(R) == self.A:
-                self.R = tuple(multiply(P[aa], R[aa]).sum(1).reshape(self.S)
+                self.R = tuple(_np.multiply(P[aa], R[aa]).sum(1).reshape(self.S)
                                for aa in range(self.A))
             else:
-                r = array(R).reshape(self.S)
+                r = _np.array(R).reshape(self.S)
                 self.R = tuple(r for aa in range(self.A))
     def run(self):
@@ -369,10 +368,10 @@ class FiniteHorizon(MDP):
         # induction
         del self.iter
         # There are value vectors for each time step up to the horizon
-        self.V = zeros((self.S, N + 1))
+        self.V = _np.zeros((self.S, N + 1))
         # There are policy vectors for each time step before the horizon, when
         # we reach the horizon we don't need to make decisions anymore.
-        self.policy = empty((self.S, N), dtype=int)
+        self.policy = _np.empty((self.S, N), dtype=int)
         # Set the reward for the final transition to h, if specified.
         if h is not None:
             self.V[:, N] = h
@@ -381,7 +380,7 @@ class FiniteHorizon(MDP):
     def run(self):
         # Run the finite horizon algorithm.
-        self.time = time()
+        self.time = _time.time()
         # loop through each time period
         for n in range(self.N):
             W, X = self._bellmanOperator(self.V[:, self.N - n])
@@ -392,7 +391,7 @@ class FiniteHorizon(MDP):
                 print(("stage: %s, policy: %s") % (
                     stage, self.policy[:, stage].tolist()))
         # update time spent running
-        self.time = time() - self.time
+        self.time = _time.time() - self.time
         # After this we could create a tuple of tuples for the values and
         # policies.
         #self.V = tuple(tuple(self.V[:, n].tolist()) for n in range(self.N))
@@ -447,10 +446,6 @@ class LP(MDP):
         except ImportError:
             raise ImportError("The python module cvxopt is required to use "
                               "linear programming functionality.")
-        # we also need diagonal matrices, and using a sparse one may be more
-        # memory efficient
-        from scipy.sparse import eye as speye
-        self._speye = speye
         # initialise the MDP. epsilon and max_iter are not needed
         MDP.__init__(self, transitions, reward, discount, None, None)
         # Set the cvxopt solver to be quiet by default, but ...
@@ -462,7 +457,7 @@ class LP(MDP):
     def run(self):
        #Run the linear programming algorithm.
-        self.time = time()
+        self.time = _time.time()
         # The objective is to resolve : min V / V >= PR + discount*P*V
         # The function linprog of the optimisation Toolbox of Mathworks
         # resolves :
@@ -471,23 +466,23 @@ class LP(MDP):
         # min V / (discount*P-I) * V <= - PR
         # To avoid loop on states, the matrix M is structured following actions
         # M(A*S,S)
-        f = self._cvxmat(ones((self.S, 1)))
+        f = self._cvxmat(_np.ones((self.S, 1)))
         h = self._cvxmat(self.R.reshape(self.S * self.A, 1, order="F"), tc='d')
-        M = zeros((self.A * self.S, self.S))
+        M = _np.zeros((self.A * self.S, self.S))
         for aa in range(self.A):
             pos = (aa + 1) * self.S
             M[(pos - self.S):pos, :] = (
-                self.discount * self.P[aa] - self._speye(self.S, self.S))
+                self.discount * self.P[aa] - _sp.eye(self.S, self.S))
         M = self._cvxmat(M)
         # Using the glpk option will make this behave more like Octave
         # (Octave uses glpk) and perhaps Matlab. If solver=None (ie using the
         # default cvxopt solver) then V agrees with the Octave equivalent
         # only to 10e-8 places. This assumes glpk is installed of course.
-        self.V = array(self._linprog(f, M, -h, solver='glpk')['x'])
+        self.V = _np.array(self._linprog(f, M, -h, solver='glpk')['x'])
         # apply the Bellman operator
         self.policy, self.V = self._bellmanOperator()
         # update the time spent solving
-        self.time = time() - self.time
+        self.time = _time.time() - self.time
         # store value and policy as tuples
         self.V = tuple(self.V.tolist())
         self.policy = tuple(self.policy.tolist())
@@ -559,13 +554,13 @@ class PolicyIteration(MDP):
         if policy0 == None:
             # Initialise the policy to the one which maximises the expected
             # immediate reward
-            null = zeros(self.S)
+            null = _np.zeros(self.S)
             self.policy, null = self._bellmanOperator(null)
             del null
         else:
             # Use the policy that the user supplied
             # Make sure it is a numpy array
-            policy0 = array(policy0)
+            policy0 = _np.array(policy0)
             # Make sure the policy is the right size and shape
             assert policy0.shape in ((self.S, ), (self.S, 1), (1, self.S)), \
                 "'policy0' must a vector with length S."
@@ -573,18 +568,14 @@ class PolicyIteration(MDP):
             policy0 = policy0.reshape(self.S)
             # The policy can only contain integers between 0 and S-1
             msg = "'policy0' must be a vector of integers between 0 and S-1."
-            assert not mod(policy0, 1).any(), msg
+            assert not _np.mod(policy0, 1).any(), msg
             assert (policy0 >= 0).all(), msg
             assert (policy0 < self.S).all(), msg
             self.policy = policy0
         # set the initial values to zero
-        self.V = zeros(self.S)
+        self.V = _np.zeros(self.S)
         # Do some setup depending on the evaluation type
         if eval_type in (0, "matrix"):
-            from numpy.linalg import solve
-            from scipy.sparse import eye
-            self._speye = eye
-            self._lin_eq = solve
             self.eval_type = "matrix"
         elif eval_type in (1, "iterative"):
             self.eval_type = "iterative"
@@ -615,8 +606,8 @@ class PolicyIteration(MDP):
         # Ppolicy(SxS) = transition matrix for policy
         # PRpolicy(S) = reward matrix for policy
         #
-        Ppolicy = empty((self.S, self.S))
-        Rpolicy = zeros(self.S)
+        Ppolicy = _np.empty((self.S, self.S))
+        Rpolicy = _np.zeros(self.S)
         for aa in range(self.A): # avoid looping over S
             # the rows that use action a.
             ind = (self.policy == aa).nonzero()[0]
@@ -634,8 +625,8 @@ class PolicyIteration(MDP):
         # it should be possible in the future. Also, if R is so big that its
         # a good idea to use a sparse matrix for it, then converting PRpolicy
         # from a dense to sparse matrix doesn't seem very memory efficient
-        if type(self.R) is sparse:
-            Rpolicy = sparse(Rpolicy)
+        if type(self.R) is _sp.csr_matrix:
+            Rpolicy = _sp.csr_matrix(Rpolicy)
         #self.Ppolicy = Ppolicy
         #self.Rpolicy = Rpolicy
         return (Ppolicy, Rpolicy)
@@ -674,12 +665,12 @@ class PolicyIteration(MDP):
         try:
             assert V0.shape in ((self.S, ), (self.S, 1), (1, self.S)), \
                 "'V0' must be a vector of length S."
-            policy_V = array(V0).reshape(self.S)
+            policy_V = _np.array(V0).reshape(self.S)
         except AttributeError:
             if V0 == 0:
-                policy_V = zeros(self.S)
+                policy_V = _np.zeros(self.S)
             else:
-                policy_V = array(V0).reshape(self.S)
+                policy_V = _np.array(V0).reshape(self.S)
         policy_P, policy_R = self._computePpolicyPRpolicy()
@@ -694,7 +685,7 @@ class PolicyIteration(MDP):
             Vprev = policy_V
             policy_V = policy_R + self.discount * policy_P.dot(Vprev)
-            variation = absolute(policy_V - Vprev).max()
+            variation = _np.absolute(policy_V - Vprev).max()
             if self.verbose:
                 print((' %s\t\t %s') % (itr, variation))
@@ -702,11 +693,11 @@ class PolicyIteration(MDP):
             if variation < ((1 - self.discount) / self.discount) * epsilon:
                 done = True
                 if self.verbose:
-                    print(MSG_STOP_EPSILON_OPTIMAL_VALUE)
+                    print(_MSG_STOP_EPSILON_OPTIMAL_VALUE)
             elif itr == max_iter:
                 done = True
                 if self.verbose:
-                    print(MSG_STOP_MAX_ITER)
+                    print(_MSG_STOP_MAX_ITER)
         self.V = policy_V
@@ -732,8 +723,8 @@ class PolicyIteration(MDP):
         #
         Ppolicy, Rpolicy = self._computePpolicyPRpolicy()
         # V = PR + gPV => (I-gP)V = PR => V = inv(I-gP)* PR
-        self.V = self._lin_eq(
-            (self._speye(self.S, self.S) - self.discount * Ppolicy), Rpolicy)
+        self.V = _np.linalg.solve(
+            (_sp.eye(self.S, self.S) - self.discount * Ppolicy), Rpolicy)
     def run(self):
         # Run the policy iteration algorithm.
@@ -742,7 +733,7 @@ class PolicyIteration(MDP):
             print(' Iteration\t\tNumber of different actions')
         # Set up the while stopping condition and the current time
         done = False
-        self.time = time()
+        self.time = _time.time()
         # loop until a stopping condition is reached
         while not done:
             self.iter += 1
@@ -767,15 +758,15 @@ class PolicyIteration(MDP):
             if n_different == 0:
                 done = True
                 if self.verbose:
-                    print(MSG_STOP_UNCHANGING_POLICY)
+                    print(_MSG_STOP_UNCHANGING_POLICY)
             elif (self.iter == self.max_iter):
                 done = True
                 if self.verbose:
-                    print(MSG_STOP_MAX_ITER)
+                    print(_MSG_STOP_MAX_ITER)
             else:
                 self.policy = policy_next
         # update the time to return th computation time
-        self.time = time() - self.time
+        self.time = _time.time() - self.time
         # store value and policy as tuples
         self.V = tuple(self.V.tolist())
         self.policy = tuple(self.policy.tolist())
@@ -851,10 +842,10 @@ class PolicyIterationModified(PolicyIteration):
         self.thresh = self.epsilon
         if self.discount == 1:
-            self.V = zeros((self.S, 1))
+            self.V = _np.zeros((self.S, 1))
         else:
             Rmin = min(R.min() for R in self.R)
-            self.V = 1 / (1 - self.discount) * Rmin * ones((self.S,))
+            self.V = 1 / (1 - self.discount) * Rmin * _np.ones((self.S,))
         # Call the iteration method
         #self.run()
@@ -865,7 +856,7 @@ class PolicyIterationModified(PolicyIteration):
         if self.verbose:
             print(' \tIteration\t\tV-variation')
-        self.time = time()
+        self.time = _time.time()
         done = False
         while not done:
@@ -892,7 +883,7 @@ class PolicyIterationModified(PolicyIteration):
         if is_verbose:
             self.setVerbose()