Commit a285341c authored by Steven Cordwell

added new skeleton functions

parent 59bb9009
......@@ -32,17 +32,19 @@ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
from numpy import abs, array, ceil, log, matrix, ndarray, ones, zeros
from numpy import abs, array, matrix, ndarray, ones, zeros
from random import randint, random
from math import ceil, log, sqrt
from time import time
class MDP():
"""
"""
def bellmanOperator(self, Vprev):
"""Applies the Bellman operator on the value function Vprev.
"""The Markov Decision Problem Toolbox."""
def bellmanOperator(self):
"""Apply the Bellman operator on the value function.
Returns a new value function and a Vprev-improving policy
Arguments --------------------------------------------------------------
Parameters
--------------------------------------------------------------
Let S = number of states, A = number of actions
P(SxSxA) = transition matrix
P could be an array with 3 dimensions or a cell array (1xA),
......@@ -51,13 +53,14 @@ class MDP():
PR could be an array with 2 dimensions or a sparse matrix
discount = discount rate, in ]0, 1]
Vprev(S) = value function
Evaluation -------------------------------------------------------------
Returns
-------------------------------------------------------------
V(S) = new value function
policy(S) = Vprev-improving policy
"""
Q = matrix(zeros((self.S, self.A)))
for aa in range(self.A):
Q[:, aa] = self.R[:, aa] + (self.discount * self.P[aa] * Vprev)
Q[:, aa] = self.R[:, aa] + (self.discount * self.P[aa] * self.value)
# update the value and policy
self.value = Q.max(axis=1)
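# For illustration (hypothetical numbers, not part of this commit): with
# S = A = 2, R = [[1, 0], [0, 2]], discount = 0.9, value = [1, 1] and both
# P[a] equal to the identity, the loop above gives Q = [[1.9, 0.9], [0.9, 2.9]],
# so the new value is [1.9, 2.9] and the greedy policy is [0, 1].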
......@@ -70,7 +73,8 @@ class MDP():
The transition matrix P must be of the form P(AxSxS) and P[a,:,:]
must be stochastic
The reward matrix R must be of the form (SxSxA) or (SxA)
Arguments --------------------------------------------------------------
Arguments
--------------------------------------------------------------
P(SxSxA) = transition matrix
P could be an array with 3 dimensions or a cell array (1xA),
each cell containing a matrix (SxS) possibly sparse
......@@ -78,7 +82,8 @@ class MDP():
R could be an array with 3 dimensions (SxSxA) or a cell array
(1xA), each cell containing a sparse matrix (SxS) or a 2D
array(SxA) possibly sparse
Evaluation -------------------------------------------------------------
Evaluation
-------------------------------------------------------------
is_mdp = True if P and R define a Markov Decision Process, False
otherwise
err_msg = error message or None if correct
......@@ -93,18 +98,18 @@ class MDP():
# be converted to an object array. A numpy object array is similar to a
# MATLAB cell array.
if (not type(P) is ndarray):
return(False, "The transition probability must be a numpy ndarray.")
return(False, "The transition probabilities must be a numpy array.")
elif ((type(P) is ndarray) and (not P.dtype is object) and (P.ndim != 3)):
return(False, "The transition probability array must have 3 dimensions: AxSxS.")
elif ((type(P) is ndarray) and (P.dtype is object) and (P.ndim > 1)):
return(False, "You are using and object array for the transition probability: The array must have only 1 dimension: A. Each element of the contains a SxS array.")
return(False, "You are using an object array for the transition probability: The array must have only 1 dimension: A. Each element of the contains a SxS array.")
if (not type(R) is ndarray):
return(False, "The reward must be a numpy ndarray.")
return(False, "The reward must be a numpy array.")
elif ((type(R) is ndarray) and (not R.dtype is object) and (not R.ndim in (2, 3))):
return(False, "The reward array must have 2 or 3 dimensions: AxSxS or SxA.")
elif ((type(R) is ndarray) and (R.dtype is object) and (R.ndim > 1)):
return(False, "You are using and object array for the reward: The array must have only 1 dimension: A. Each element of the contains a SxS array.")
return(False, "You are using an object array for the reward: The array must have only 1 dimension: A. Each element of the contains a SxS array.")
if (P.dtype is object):
P_is_object = True
......@@ -175,9 +180,11 @@ class MDP():
def checkSquareStochastic(self, Z):
"""Check if Z is a square stochastic matrix
Arguments --------------------------------------------------------------
Arguments
--------------------------------------------------------------
Z = a numpy ndarray SxS
Evaluation -------------------------------------------------------------
Evaluation
-------------------------------------------------------------
error_msg = error message or None if correct
"""
s1, s2 = Z.shape
......@@ -189,11 +196,17 @@ class MDP():
return('MDP Toolbox ERROR: Probabilities must be non-negative')
else:
return(None)
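# For illustration (hypothetical inputs, not part of this commit): a matrix
# such as array([[0.5, 0.5], [0.2, 0.8]]) should pass this check, while
# array([[0.5, 0.6], [0.2, 0.8]]) should trigger the "must sum to 1" error
# and array([[1.1, -0.1], [0.0, 1.0]]) the non-negativity error.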
def computePpolicyPRpolicy(self):
"""Computes the transition matrix and the reward matrix for a policy.
"""
pass
def computePR(self, P, R):
"""Computes the reward for the system in one state chosing an action
Arguments --------------------------------------------------------------
Arguments
--------------------------------------------------------------
Let S = number of states, A = number of actions
P(SxSxA) = transition matrix
P could be an array with 3 dimensions or a cell array (1xA),
......@@ -202,7 +215,8 @@ class MDP():
R could be an array with 3 dimensions (SxSxA) or a cell array
(1xA), each cell containing a sparse matrix (SxS) or a 2D
array(SxA) possibly sparse
Evaluation -------------------------------------------------------------
Evaluation
-------------------------------------------------------------
PR(SxA) = reward matrix
"""
# make P be an object array with (S, S) shaped array elements
......@@ -239,46 +253,231 @@ class MDP():
if (type(self.R) is ndarray):
self.R = matrix(self.R)
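# Illustrative note (assumption, not part of this commit): for an R of shape
# (AxSxS) the (SxA) reward matrix is presumably the expected reward
# PR[s, a] = sum over s' of P[a][s, s'] * R[a][s, s'], whereas an R that is
# already (SxA) can be used directly.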
def silent(self):
"""Ask for running resolution functions of the MDP Toolbox in silent
mode.
"""
# self.verbose = False
pass
def span(self, W):
"""Returns the span of W
sp(W) = max W(s) - min W(s)
"""
return (W.max() - W.min())
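# For example, span(array([1, 4, 2])) = 4 - 1 = 3.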
def verbose(self):
"""Ask for running resolution functions of the MDP Toolbox in verbose
mode.
"""
# self.verbose = True
pass
class ValueIteration(MDP):
"""Resolution of discounted Markov Decision Problem with value iteration
class ExampleForest(MDP):
"""Generate a Markov Decision Process example based on a simple forest
management.
"""
pass
class ExampleRand(MDP):
"""Generate a random Markov Decision Process.
"""
pass
class FiniteHorizon(MDP):
"""Resolution of finite-horizon MDP with backwards induction.
"""
pass
class LP(MDP):
"""Resolution of discounted MDP with linear programming.
"""
pass
class PolicyIteration(MDP):
"""Resolution of discounted MDP with policy iteration algorithm.
"""
pass
class PolicyIterationModified(MDP):
"""Resolution of discounted MDP with modified policy iteration algorithm.
"""
pass
class QLearning(MDP):
"""Evaluation of the matrix Q, using the Q learning algorithm.
"""
def __init__(self, transitions, reward, discount, n_iter=10000):
"""Evaluation of the matrix Q, using the Q learning algorithm
Arguments
-----------------------------------------------------------------------
Let S = number of states, A = number of actions
transitions(SxSxA) = transition matrix
P could be an array with 3 dimensions or a cell array (1xA), each
cell containing a sparse matrix (SxS)
reward(SxSxA) or (SxA) = reward matrix
R could be an array with 3 dimensions (SxSxA) or a cell array
(1xA), each cell containing a sparse matrix (SxS) or a 2D
array(SxA) possibly sparse
discount = discount rate in ]0; 1[
n_iter(optional) = number of iterations to execute.
Default value = 10000; it must be an integer greater than or equal
to the default value.
Evaluation
-----------------------------------------------------------------------
Q(SxA) = learned Q matrix
value(S) = learned value function.
policy(S) = learned optimal policy.
mean_discrepancy(N/100) = vector of the mean V discrepancy over each
block of 100 iterations. For the default value of N the length of
this vector is 100.
"""
# Check of arguments
if (discount <= 0) or (discount >= 1):
raise ValueError("MDP Toolbox Error: Discount rate must be in ]0,1[")
elif (n_iter < 10000):
raise ValueError("MDP Toolbox Error: n_iter must be greater than or equal to 10000")
is_mdp, err_msg = self.check(transitions, reward)
if (not is_mdp):
raise TypeError(err_msg)
self.computePR(transitions, reward)
self.discount = discount
self.n_iter = n_iter
# Initialisations
self.Q = zeros((self.S, self.A))
#self.dQ = zeros(self.S, self.A)
self.mean_discrepancy = []
self.discrepancy = zeros(100)
self.time = None
def iterate(self):
"""
"""
self.time = time()
# initial state choice
s = randint(0, self.S - 1)
for n in range(self.n_iter):
# Reinitialisation of trajectories every 100 transitions
if ((n % 100) == 0):
s = randint(0, self.S - 1)
# Action choice : greedy with increasing probability
# probability 1-(1/log(n+2)) can be changed
pn = random()
if (pn < (1 - (1 / log(n + 2)))):
a = self.Q[s, :].argmax()
else:
a = randint(0, self.A - 1)
# Simulating next state s_new and reward associated to <s,s_new,a>
p_s_new = random()
p = 0
s_new = -1
while ((p < p_s_new) and (s_new < (self.S - 1))):
s_new = s_new + 1
if (self.P.dtype is object):
p = p + self.P[a][s, s_new]
else:
p = p + self.P[a, s, s_new]
if (self.R.dtype is object):
r = self.R[a][s, s_new]
elif (self.R.ndim == 3):
r = self.R[a, s, s_new]
else:
r = self.R[s, a]
# Updating the value of Q
# Decaying update coefficient (1/sqrt(n+2)) can be changed
delta = r + self.discount * self.Q[s_new, ].max() - self.Q[s, a]
dQ = (1 / sqrt(n + 2)) * delta
self.Q[s, a] = self.Q[s, a] + dQ
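# For illustration (hypothetical numbers, not part of this commit): with
# discount = 0.9, r = 1, self.Q[s_new, :].max() = 2 and self.Q[s, a] = 1.5,
# delta = 1 + 0.9 * 2 - 1.5 = 1.3, and at n = 0 the learning rate is
# 1 / sqrt(2), roughly 0.71, so dQ is roughly 0.92.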
# current state is updated
s = s_new
# Save the absolute value of the Q variation for this step
self.discrepancy[n % 100] = abs(dQ)
# Compute the mean of the saved Q variations every 100 iterations
if ((n % 100) == 99):
self.mean_discrepancy.append(self.discrepancy.mean())
self.discrepancy = zeros(100)
# compute the value function and the policy
self.value = self.Q.max(axis=1)
self.policy = self.Q.argmax(axis=1)
self.time = time() - self.time
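# Illustrative usage sketch (assumption, not part of this commit): given a
# transition array P of shape (A, S, S) and a reward array R of shape (S, A),
# one would presumably run something like
#     ql = QLearning(P, R, discount=0.9, n_iter=10000)
#     ql.iterate()
# and then read the learned ql.Q, ql.value and ql.policy.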
class RelativeValueIteration(MDP):
"""Resolution of MDP with average reward with relative value iteration
algorithm.
"""
pass
class ValueIteration(MDP):
"""
Resolve a discounted Markov Decision Problem with value iteration.
"""
def __init__(self, transitions, reward, discount, epsilon=0.01, max_iter=1000, initial_value=0, verbose=False):
"""Resolution of discounted MDP with value iteration algorithm.
Arguments --------------------------------------------------------------
Let S = number of states, A = number of actions
P = transition matrix
P could be a numpy ndarray with 3 dimensions (AxSxS) or a
numpy ndarray of dytpe=object with 1 dimenion (1xA), each
element containing a numpy ndarray (SxS) or scipy sparse matrix
R = reward matrix
R could be a numpy ndarray with 3 dimensions (AxSxS) or numpy
ndarray of dtype=object with 1 dimension (1xA), each element
containing a sparse matrix (SxS). R also could be a numpy
ndarray with 2 dimensions (SxA) possibly sparse
discount = discount rate in ]0; 1]
Beware to check conditions of convergence for discount = 1.
epsilon = epsilon-optimal policy search
Greater than 0, optional (default: 0.01).
max_iter = maximum number of iterations to be done.
greater than 0, optional (default: computed)
V0 = starting value function
optional (default: zeros(S,1))
Evaluation -------------------------------------------------------------
V(S) = value function
policy(S) = epsilon-optimal policy
iter = number of done iterations
cpu_time = used CPU time
------------------------------------------------------------------------
Arguments
---------
Let S = number of states, A = number of actions.
transitions = transition matrix
P could be a numpy ndarray with 3 dimensions (AxSxS) or a
numpy ndarray of dtype=object with 1 dimension (1xA), each
element containing a numpy ndarray (SxS) or scipy sparse matrix.
reward = reward matrix
R could be a numpy ndarray with 3 dimensions (AxSxS) or numpy
ndarray of dtype=object with 1 dimension (1xA), each element
containing a sparse matrix (SxS). R also could be a numpy
ndarray with 2 dimensions (SxA) possibly sparse.
discount = discount rate in ]0; 1]
Beware to check conditions of convergence for discount = 1.
epsilon = epsilon-optimal policy search
Greater than 0, optional (default: 0.01).
max_iter = maximum number of iterations to be done.
Greater than 0, optional (default: computed).
initial_value = starting value function.
optional (default: zeros(S,1)).
Evaluation
----------
value(S) = value function.
policy(S) = epsilon-optimal policy.
iter = number of iterations done.
time = used CPU time.
Notes
-----
In verbose mode, at each iteration, displays the variation of V
and the condition which stopped iterations: epsilon-optimum policy found
or maximum number of iterations reached.
......@@ -315,6 +514,8 @@ class ValueIteration(MDP):
self.thresh = epsilon
self.itr = 0
self.time = None
def boundIter(self, epsilon):
"""Computes a bound for the number of iterations for the value iteration
......@@ -364,10 +565,10 @@ class ValueIteration(MDP):
Vprev = self.value
# Bellman Operator: updates "self.value" and "self.policy"
self.bellmanOperator(Vprev)
self.bellmanOperator()
# The values, based on Q. For the function "max()": the option
# "axis" means the axis along which to operate. In this case it
# The values, based on Q. For the function "max()": the option
# "axis" means the axis along which to operate. In this case it
# finds the maximum of each row by operating along the columns (axis=1).
variation = self.span(self.value - Vprev)
......@@ -388,5 +589,7 @@ class ValueIteration(MDP):
self.time = time() - self.time
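# Illustrative usage sketch (assumption, not part of this commit):
#     vi = ValueIteration(P, R, discount=0.9, epsilon=0.01)
#     vi.iterate()
# after which vi.value, vi.policy, vi.itr and vi.time hold the results.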
class QLearning():
pass
class ValueIterationGS(MDP):
"""Resolution of discounted MDP with value iteration Gauss-Seidel algorithm.
"""
pass
\ No newline at end of file
......@@ -5,7 +5,7 @@ Created on Sun May 27 23:16:57 2012
@author: -
"""
from mdp import MDP
from mdp import MDP, QLearning
from numpy import array, eye, matrix, ones
from numpy.random import randint
from scipy.sparse import eye as speye
......@@ -71,4 +71,5 @@ def test_check_square_stochastic_array_Rtranspose():
P = array([eye(DIM), eye(DIM)])
R = array([ones(DIM), ones(DIM)])
assert inst.check(P, R) == (True, "R is wrong way")
\ No newline at end of file
inst = QLearning()
\ No newline at end of file