Commit 32fb5783 authored by Steven Cordwell

Fix some formatting mistakes

Make the code more compliant with PEP 8.
parent 507ba06b
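For reviewers unfamiliar with the specifics, the recurring changes in the diff below are: two blank lines before top-level ``def`` and ``class`` statements, two spaces before inline comments, parenthesised continuations instead of backslash-continued ``assert`` messages, a named ``def`` in place of a ``lambda`` bound to a name, and removal of trailing whitespace (which is why several old/new line pairs below look identical in this plain-text rendering). A minimal sketch of the target style; the names are illustrative and not taken from the toolbox:

    import random


    def check_discount(discount):
        # Parenthesised continuation instead of a backslash-continued string.
        assert 0.0 < discount <= 1.0, (
            "Discount rate must be in ]0; 1]"
        )
        return discount  # two spaces before an inline comment


    def pick_action(n_actions):
        # A named def where an assigned lambda might otherwise be tempting.
        return random.randrange(n_actions)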
@@ -18,7 +18,7 @@ Available modules
How to use the documentation
----------------------------
Documentation is available both as docstrings provided with the code and
in html or pdf format from
in html or pdf format from
`The MDP toolbox homepage <http://www.somewhere.com>`_. The docstring
examples assume that the ``mdptoolbox`` package is imported like so::
@@ -45,19 +45,19 @@ source code use ``mdp.ValueIteration??<ENTER>``.
Acknowledgments
---------------
This module is modified from the MDPtoolbox (c) 2009 INRA available at
This module is modified from the MDPtoolbox (c) 2009 INRA available at
http://www.inra.fr/mia/T/MDPtoolbox/.
"""
# Copyright (c) 2011-2013 Steven A. W. Cordwell
# Copyright (c) 2009 INRA
#
#
# All rights reserved.
#
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
@@ -66,7 +66,7 @@ http://www.inra.fr/mia/T/MDPtoolbox/.
# * Neither the name of the <ORGANIZATION> nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
......
@@ -49,6 +49,7 @@ Available functions
import numpy as _np
import scipy.sparse as _sp
def forest(S=3, r1=4, r2=2, p=0.1, is_sparse=False):
"""Generate a MDP example based on a simple forest management scenario.
@@ -187,6 +188,7 @@ def forest(S=3, r1=4, r2=2, p=0.1, is_sparse=False):
R[S - 1, 1] = r2
return(P, R)
def _randDense(states, actions, mask):
"""Generate random dense ``P`` and ``R``. See ``rand`` for details.
@@ -204,7 +206,7 @@ def _randDense(states, actions, mask):
m[m <= r] = 0
m[m > r] = 1
elif mask.shape == (actions, states, states):
m = mask[action][state] # mask[action, state, :]
m = mask[action][state] # mask[action, state, :]
else:
m = mask[state]
# Make sure that there is at least one transition in each state
@@ -216,6 +218,7 @@ def _randDense(states, actions, mask):
_np.ones(states, dtype=int)))
return(P, R)
def _randSparse(states, actions, mask):
"""Generate random sparse ``P`` and ``R``. See ``rand`` for details.
@@ -236,10 +239,10 @@ def _randSparse(states, actions, mask):
m[m <= 2/3.0] = 0
m[m > 2/3.0] = 1
elif mask.shape == (actions, states, states):
m = mask[action][state] # mask[action, state, :]
m = mask[action][state] # mask[action, state, :]
else:
m = mask[state]
n = int(m.sum()) # m[state, :]
n = int(m.sum()) # m[state, :]
if n == 0:
m[_np.random.randint(0, states)] = 1
n = 1
@@ -261,6 +264,7 @@ def _randSparse(states, actions, mask):
R[action] = RR.tocsr()
return(P, R)
def rand(S, A, is_sparse=False, mask=None):
"""Generate a random Markov Decision Process.
@@ -345,8 +349,9 @@ def rand(S, A, is_sparse=False, mask=None):
if mask is not None:
# the mask needs to be SxS or AxSxS
try:
assert mask.shape in ((S, S), (A, S, S)), "'mask' must have " \
"dimensions S×S or A×S×S."
assert mask.shape in ((S, S), (A, S, S)), (
"'mask' must have dimensions S×S or A×S×S."
)
except AttributeError:
raise TypeError("'mask' must be a numpy array or matrix.")
# generate the transition and reward matrices based on S, A and mask
@@ -356,6 +361,7 @@ def rand(S, A, is_sparse=False, mask=None):
P, R = _randDense(S, A, mask)
return(P, R)
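# Editorial aside, not part of the diff: a usage sketch of ``rand`` with a
# mask, assuming it is exposed as ``mdptoolbox.example.rand``. As the
# assertion above enforces, the mask must have shape (S, S) or (A, S, S).
import numpy as np
import mdptoolbox.example

S, A = 3, 2
mask = np.array([[1, 1, 0],
                 [0, 1, 1],
                 [1, 0, 1]])  # allow transitions only where mask is non-zero
P, R = mdptoolbox.example.rand(S, A, mask=mask)
# P has shape (A, S, S) and each P[a] is row stochastic; R is the matching
# reward array.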
def small():
"""A very small Markov decision process.
......
@@ -71,6 +71,7 @@ _MSG_STOP_EPSILON_OPTIMAL_VALUE = "Iterating stopped, epsilon-optimal value " \
"function found."
_MSG_STOP_UNCHANGING_POLICY = "Iterating stopped, unchanging policy found."
def _computeDimensions(transition):
A = len(transition)
try:
@@ -82,6 +83,7 @@ def _computeDimensions(transition):
S = transition[0].shape[0]
return S, A
class MDP(object):
"""A Markov Decision Problem.
@@ -176,7 +178,9 @@ class MDP(object):
# in its computations
if discount is not None:
self.discount = float(discount)
assert 0.0 < self.discount <= 1.0, "Discount rate must be in ]0; 1]"
assert 0.0 < self.discount <= 1.0, (
"Discount rate must be in ]0; 1]"
)
if self.discount == 1:
print("WARNING: check conditions of convergence. With no "
"discount, convergence can not be assumed.")
@@ -185,8 +189,9 @@ class MDP(object):
# in its computations
if max_iter is not None:
self.max_iter = int(max_iter)
assert self.max_iter > 0, "The maximum number of iterations " \
"must be greater than 0."
assert self.max_iter > 0, (
"The maximum number of iterations must be greater than 0."
)
# check that epsilon is something sane
if epsilon is not None:
@@ -194,8 +199,9 @@ class MDP(object):
assert self.epsilon > 0, "Epsilon must be greater than 0."
if not skip_check:
# We run a check on P and R to make sure they are describing an MDP.
# If an exception isn't raised then they are assumed to be correct.
# We run a check on P and R to make sure they are describing an
# MDP. If an exception isn't raised then they are assumed to be
# correct.
_util.check(transitions, reward)
self.S, self.A = _computeDimensions(transitions)
@@ -293,17 +299,19 @@ class MDP(object):
if _sp.issparse(reward):
raise NotImplementedError
else:
func = lambda x: _np.array(x).reshape(self.S)
def func(x):
return _np.array(x).reshape(self.S)
return tuple(func(reward[:, a]) for a in range(self.A))
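# Editorial aside, not part of the diff: PEP 8 recommends a def statement over
# binding a lambda to a name, which is why ``func`` becomes a nested function
# here; the behaviour is unchanged.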
def _computeMatrixReward(self, reward, transition):
if _sp.issparse(reward):
# An approach like this might be more memory efficient
#reward.data = reward.data * transition[reward.nonzero()]
#return reward.sum(1).A.reshape(self.S)
# reward.data = reward.data * transition[reward.nonzero()]
# return reward.sum(1).A.reshape(self.S)
# but doesn't work as it is.
return reward.multiply(transition).sum(1).A.reshape(self.S)
elif _sp.issparse(transition):
elif _sp.issparse(transition):
return transition.multiply(reward).sum(1).A.reshape(self.S)
else:
return _np.multiply(transition, reward).sum(1).reshape(self.S)
@@ -320,6 +328,7 @@ class MDP(object):
"""Set the MDP algorithm to verbose mode."""
self.verbose = True
class FiniteHorizon(MDP):
"""A MDP solved using the finite-horizon backwards induction algorithm.
@@ -413,10 +422,11 @@ class FiniteHorizon(MDP):
self.time = _time.time() - self.time
# After this we could create a tuple of tuples for the values and
# policies.
#self.V = tuple(tuple(self.V[:, n].tolist()) for n in range(self.N))
#self.policy = tuple(tuple(self.policy[:, n].tolist())
# self.V = tuple(tuple(self.V[:, n].tolist()) for n in range(self.N))
# self.policy = tuple(tuple(self.policy[:, n].tolist())
# for n in range(self.N))
class _LP(MDP):
"""A discounted MDP soloved using linear programming.
@@ -485,7 +495,7 @@ class _LP(MDP):
solvers.options['show_progress'] = False
def run(self):
#Run the linear programming algorithm.
# Run the linear programming algorithm.
self.time = _time.time()
# The objective is to solve: min V s.t. V >= PR + discount*P*V
# The function linprog of the optimisation Toolbox of Mathworks
@@ -517,6 +527,7 @@ class _LP(MDP):
self.V = tuple(self.V.tolist())
self.policy = tuple(self.policy.tolist())
class PolicyIteration(MDP):
"""A discounted MDP solved using the policy iteration algorithm.
@@ -642,7 +653,7 @@ class PolicyIteration(MDP):
#
Ppolicy = _np.empty((self.S, self.S))
Rpolicy = _np.zeros(self.S)
for aa in range(self.A): # avoid looping over S
for aa in range(self.A): # avoid looping over S
# the rows that use action a.
ind = (self.policy == aa).nonzero()[0]
# if no rows use action a, then no need to assign this
@@ -651,7 +662,7 @@ class PolicyIteration(MDP):
Ppolicy[ind, :] = self.P[aa][ind, :]
except ValueError:
Ppolicy[ind, :] = self.P[aa][ind, :].todense()
#PR = self._computePR() # an apparently unneeded line, and
# PR = self._computePR() # an apparently unneeded line, and
# perhaps harmful in this implementation c.f.
# mdp_computePpolicyPRpolicy.m
Rpolicy[ind] = self.R[aa][ind]
@@ -661,8 +672,8 @@ class PolicyIteration(MDP):
# from a dense to sparse matrix doesn't seem very memory efficient
if type(self.R) is _sp.csr_matrix:
Rpolicy = _sp.csr_matrix(Rpolicy)
#self.Ppolicy = Ppolicy
#self.Rpolicy = Rpolicy
# self.Ppolicy = Ppolicy
# self.Rpolicy = Rpolicy
return (Ppolicy, Rpolicy)
def _evalPolicyIterative(self, V0=0, epsilon=0.0001, max_iter=10000):
@@ -746,8 +757,8 @@ class PolicyIteration(MDP):
# each cell containing a matrix (SxS) possibly sparse
# R(SxSxA) or (SxA) = reward matrix
# R could be an array with 3 dimensions (SxSxA) or
# a cell array (1xA), each cell containing a sparse matrix (SxS) or
# a 2D array(SxA) possibly sparse
# a cell array (1xA), each cell containing a sparse matrix (SxS)
# or a 2D array(SxA) possibly sparse
# discount = discount rate in ]0; 1[
# policy(S) = a policy
#
@@ -805,6 +816,7 @@ class PolicyIteration(MDP):
self.V = tuple(self.V.tolist())
self.policy = tuple(self.policy.tolist())
class PolicyIterationModified(PolicyIteration):
"""A discounted MDP solved using a modifified policy iteration algorithm.
@@ -899,7 +911,7 @@ class PolicyIterationModified(PolicyIteration):
self.iter += 1
self.policy, Vnext = self._bellmanOperator()
#[Ppolicy, PRpolicy] = mdp_computePpolicyPRpolicy(P, PR, policy);
# [Ppolicy, PRpolicy] = mdp_computePpolicyPRpolicy(P, PR, policy);
variation = _util.getSpan(Vnext - self.V)
if self.verbose:
@@ -925,6 +937,7 @@ class PolicyIterationModified(PolicyIteration):
self.V = tuple(self.V.tolist())
self.policy = tuple(self.policy.tolist())
class QLearning(MDP):
"""A discounted MDP solved using the Q learning algorithm.
@@ -1008,8 +1021,8 @@ class QLearning(MDP):
assert self.max_iter >= 10000, "'n_iter' should be greater than 10000."
if not skip_check:
# We don't want to send this to MDP because _computePR should not be
# run on it, so check that it defines an MDP
# We don't want to send this to MDP because _computePR should not
# be run on it, so check that it defines an MDP
_util.check(transitions, reward)
# Store P, S, and A
@@ -1091,6 +1104,7 @@ class QLearning(MDP):
self.V = tuple(self.V.tolist())
self.policy = tuple(self.policy.tolist())
class RelativeValueIteration(MDP):
"""A MDP solved using the relative value iteration algorithm.
@@ -1171,7 +1185,7 @@ class RelativeValueIteration(MDP):
self.discount = 1
self.V = _np.zeros(self.S)
self.gain = 0 # self.U[self.S]
self.gain = 0 # self.U[self.S]
self.average_reward = None
@@ -1216,6 +1230,7 @@ class RelativeValueIteration(MDP):
self.V = tuple(self.V.tolist())
self.policy = tuple(self.policy.tolist())
class ValueIteration(MDP):
"""A discounted MDP solved using the value iteration algorithm.
@@ -1354,7 +1369,7 @@ class ValueIteration(MDP):
# computation of threshold of variation for V for an epsilon-
# optimal policy
self.thresh = epsilon * (1 - self.discount) / self.discount
else: # discount == 1
else: # discount == 1
# threshold of variation for V for an epsilon-optimal policy
self.thresh = epsilon
@@ -1399,8 +1414,8 @@ class ValueIteration(MDP):
# p 201, Proposition 6.6.5
span = _util.getSpan(value - Vprev)
max_iter = (_math.log((epsilon * (1 - self.discount) / self.discount) /
span ) / _math.log(self.discount * k))
#self.V = Vprev
span) / _math.log(self.discount * k))
# self.V = Vprev
self.max_iter = int(_math.ceil(max_iter))
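# Editorial aside, not part of the diff: the bound computed above appears to
# follow Puterman (1994), Proposition 6.6.5. Writing gamma for the discount
# and span = max(V1 - V0) - min(V1 - V0) (see _util.getSpan), the number of
# iterations needed for an epsilon-optimal policy satisfies
#
#     N >= log(epsilon * (1 - gamma) / (gamma * span)) / log(gamma * k)
#
# and the code stores ceil(N) in self.max_iter.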
@@ -1442,6 +1457,7 @@ class ValueIteration(MDP):
self.time = _time.time() - self.time
class ValueIterationGS(ValueIteration):
"""
@@ -1528,7 +1544,7 @@ class ValueIterationGS(ValueIteration):
# computation of threshold of variation for V for an epsilon-
# optimal policy
self.thresh = epsilon * (1 - self.discount) / self.discount
else: # discount == 1
else: # discount == 1
# threshold of variation for V for an epsilon-optimal policy
self.thresh = epsilon
@@ -1548,7 +1564,7 @@ class ValueIterationGS(ValueIteration):
Vprev = self.V.copy()
for s in range(self.S):
Q = [float(self.R[a][s]+
Q = [float(self.R[a][s] +
self.discount * self.P[a][s, :].dot(self.V))
for a in range(self.A)]
@@ -1572,7 +1588,8 @@ class ValueIterationGS(ValueIteration):
for s in range(self.S):
Q = _np.zeros(self.A)
for a in range(self.A):
Q[a] = self.R[a][s] + self.discount * self.P[a][s, :].dot(self.V)
Q[a] = (self.R[a][s] +
self.discount * self.P[a][s, :].dot(self.V))
self.V[s] = Q.max()
self.policy.append(int(Q.argmax()))
......
@@ -90,6 +90,7 @@ _MDPERR = {
"actions greater than 0. i.e. R.shape = (S, A) or (A, S, S)."
}
def _checkDimensionsListLike(arrays):
"""Check that each array in a list of arrays has the same size.
@@ -102,6 +103,7 @@ def _checkDimensionsListLike(arrays):
raise _error.InvalidError(_MDPERR["obj_square"])
return dim1, dim2, dim3
def _checkRewardsListLike(reward, n_actions, n_states):
"""Check that a list-like reward input is valid.
@@ -119,6 +121,7 @@ def _checkRewardsListLike(reward, n_actions, n_states):
raise _error.InvalidError(_MDPERR["R_shape"])
return dim1, dim2, dim3
def isSquare(matrix):
"""Check that ``matrix`` is square.
@@ -139,6 +142,7 @@ def isSquare(matrix):
return True
return False
def isStochastic(matrix):
"""Check that ``matrix`` is row stochastic.
@@ -155,6 +159,7 @@ def isStochastic(matrix):
absdiff = (_np.abs(matrix.sum(axis=1) - _np.ones(matrix.shape[0])))
return (absdiff.max() <= 10*_np.spacing(_np.float64(1)))
def isNonNegative(matrix):
"""Check that ``matrix`` is row non-negative.
@@ -177,6 +182,7 @@ def isNonNegative(matrix):
return True
return False
def checkSquareStochastic(matrix):
"""Check if ``matrix`` is a square and row-stochastic.
@@ -206,6 +212,7 @@ def checkSquareStochastic(matrix):
if not isNonNegative(matrix):
raise _error.NonNegativeError
def check(P, R):
"""Check if ``P`` and ``R`` define a valid Markov Decision Process (MDP).
......
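None of the hunks above change behaviour, so the usual end-to-end example should still run unchanged. A minimal sketch, assuming the package layout used by this toolbox (``forest`` in ``mdptoolbox.example``, ``ValueIteration`` in ``mdptoolbox.mdp``):

    import mdptoolbox.example
    import mdptoolbox.mdp

    # Build the 3-state forest-management example and solve it by value
    # iteration with a discount of 0.96.
    P, R = mdptoolbox.example.forest(S=3, r1=4, r2=2, p=0.1)
    vi = mdptoolbox.mdp.ValueIteration(P, R, 0.96)
    vi.run()
    print(vi.policy)  # a tuple with one action per state, e.g. (0, 0, 0)
    print(vi.V)       # the corresponding value function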