 ... ... @@ -84,7 +84,11 @@ mdperr = { "PyMDPtoolbox: Number of states S must be greater than 1.", "SA_gt_1" : "PyMDPtoolbox: The number of states S and the number of actions A must be " "greater than 1." "greater than 1.",
    "discount_rng" :
        "PyMDPtoolbox: Discount rate must be in ]0; 1]",
    "maxi_min" :
        "PyMDPtoolbox: The maximum number of iterations must be greater than 0"
}

def exampleForest(S=3, r1=4, r2=2, p=0.1):

def exampleRand(S, A, is_sparse=False, mask=None):

class MDP(object):
    """The Markov Decision Problem Toolbox."""
    
    def __init__(self, transitions, reward, discount, max_iter):
        """Validate the arguments and set up the state shared by all solvers.

        Parameters
        ----------
        transitions
            Transition description; passed unchanged to ``self.check`` and
            ``self.computePR``.
        reward
            Reward description; passed unchanged to ``self.check`` and
            ``self.computePR``.
        discount
            Per time step discount factor; must satisfy ``0 < discount <= 1``.
        max_iter
            Maximum number of iterations; must be greater than 0.

        Raises
        ------
        ValueError
            If ``discount`` is not in ]0; 1] or ``max_iter`` is not greater
            than 0.  NaN fails both tests.
        """
        # the verbosity is by default turned off
        self.verbose = False
        # no computation has been timed yet
        self.time = None
        # Placeholders that need to be overridden in child classes:
        # S is the number of states, A the number of actions, R the reward
        # matrix and P the probability-transition matrix.
        self.S = None
        self.A = None
        self.R = None
        self.P = None
        # policy is the optimal control policy and value is a vector of
        # expected future values for each state
        self.policy = None
        self.value = None

        # Both range tests are phrased as "not (valid)" instead of
        # "(too small) or (too large)": every ordering comparison involving
        # NaN is False, so the latter form would silently accept NaN.
        if not (0 < discount <= 1):
            raise ValueError(mdperr["discount_rng"])
        # discount is the per time step discount factor
        self.discount = discount

        if not (max_iter > 0):
            raise ValueError(mdperr["maxi_min"])
        self.max_iter = max_iter

        self.check(transitions, reward)
        self.computePR(transitions, reward)
        # set the initial iteration count to zero
        self.iter = 0
    
    def bellmanOperator(self):
        """Return the largest entry of ``W`` minus its smallest entry.

        NOTE(review): ``W`` is neither a parameter nor assigned inside this
        method, so the name has to resolve from an enclosing scope or the
        call fails with NameError -- confirm against the complete source.
        The ``max() - min()`` expression also reads like a span computation
        rather than a Bellman update; verify that this body really belongs to
        this method.
        """
        return (W.max() - W.min())
    
    def setSilent(self):
        """Ask for running resolution functions of the MDP Toolbox in silent mode.

        NOTE(review): the code below does not match a method that takes only
        ``self``.  It re-runs ``MDP.__init__`` with ``transitions``,
        ``reward``, ``discount`` and ``n_iter``, none of which are parameters
        here, and the ``if ((n % 100) == 0):`` line has no body.  It looks
        like material from a different routine; confirm against the complete
        source before relying on this description.

        The original summary continued: "Then the length of this vector for
        the default value of N is 100 (N/100)."  Which vector and which N are
        meant cannot be determined from this method.

        Examples
        --------
        >>> import mdp
        >>> P, R = mdp.exampleForest()

        """
        
        # Shared validation and setup; n_iter is passed in the max_iter slot.
        MDP.__init__(self, transitions, reward, discount, n_iter)
        
        # The following check won't be done in MDP()'s initialisation, so let's
        # do it here
        # NOTE(review): the test rejects only n_iter < 10000, so exactly 10000
        # is accepted although the message says "greater than" -- confirm
        # which bound is intended.
        if (n_iter < 10000):
            raise ValueError("PyMDPtoolbox: n_iter should be greater than 10000")
        
        # Initialisations
        # Q has one row per state and one column per action, all zero.
        self.Q = zeros((self.S, self.A))
        
        # initial state choice
        for n in range(self.max_iter):
            # Reinitialisation of trajectories every 100 transitions
            # NOTE(review): the body of this test is missing.
            if ((n % 100) == 0):
        
        # For every row of Q keep the column index of its largest entry.
        self.policy = self.Q.argmax(axis=1)
        # NOTE(review): MDP.__init__ leaves self.time as None, so a start
        # timestamp must be stored somewhere not shown here; otherwise this
        # subtraction fails.
        self.time = time() - self.time
        # rather than report that we have not done any iterations, assign the
        # value of n_iter to self.iter
        self.iter = self.max_iter

class RelativeValueIteration(MDP):
    """Solve an average-reward MDP with the relative value iteration algorithm.

    The class currently adds no attributes or methods of its own to ``MDP``.
    """

class ValueIteration(MDP):
    """Placeholder subclass of ``MDP`` with no members of its own.

    NOTE(review): a second ``class ValueIteration(MDP)`` statement follows
    later in this module and rebinds the name, which makes this definition
    unreachable by name -- confirm whether it is still needed.
    """

class ValueIteration(MDP):
    """
    
    def __init__(self, transitions, reward, discount, epsilon=0.01,
                 max_iter=1000, initial_value=0):
        # transitions, reward, discount and max_iter are forwarded to
        # MDP.__init__; epsilon sets the stopping threshold computed below;
        # initial_value seeds self.value when it is not the default 0.
        #
        # NOTE(review): on the next line the docstring and the MDP.__init__
        # call share a single physical line, which is not valid Python as
        # written -- they need to be two separate statements.
        """Resolution of discounted MDP with value iteration algorithm.""" MDP.__init__(self, transitions, reward, discount, max_iter)
        
        # initialization of optional arguments
        # NOTE(review): the `if` branch below has no body, so what self.value
        # becomes for the default initial_value=0 cannot be seen here.  The
        # else branch reads initial_value.size, which suggests array input;
        # for a NumPy array with more than one element `initial_value == 0`
        # is an elementwise test whose truth value is ambiguous -- confirm
        # how array arguments are meant to reach the else branch.
        if (initial_value == 0):
        else:
            # the supplied starting value must have exactly S entries
            if (initial_value.size != self.S):
                raise ValueError("The initial value must be length S")
            self.value = matrix(initial_value)
        
        if (self.discount < 1):
            # compute a bound for the number of iterations and update the
            # stored value of self.max_iter
            self.boundIter(epsilon)
            # computation of threshold of variation for V for an epsilon-
            # optimal policy
            self.thresh = epsilon * (1 - self.discount) / self.discount
        else:  # discount == 1
            # bound for the number of iterations
            # (MDP.__init__ has already stored this same value)
            self.max_iter = max_iter
            # threshold of variation for V for an epsilon-optimal policy
            self.thresh = epsilon
        
        # start counting iterations from zero
        self.iter = 0
    
    def boundIter(self, epsilon):
        """Compute a bound on the number of value iteration steps.

        The bound is stored in ``self.max_iter``; nothing is returned.
        ``self.bellmanOperator()`` is called once to measure how far the
        value moves in a single step, and ``self.value`` is then put back to
        what it was before that call.

        Parameters
        ----------
        epsilon
            Tolerance used in the bound; it enters through
            ``epsilon * (1 - self.discount) / self.discount``.
        """
        # smallest_into[s] is the smallest entry, taken over every action,
        # of column s of the transition matrices
        smallest_into = zeros(self.S)
        for state in range(self.S):
            columns = matrix(zeros((self.S, self.A)))
            for action in range(self.A):
                columns[:, action] = self.P[action][:, state]
            # min() without any arguments reduces over the whole matrix
            smallest_into[state] = columns.min()

        k = 1 - smallest_into.sum()

        saved_value = self.value
        self.bellmanOperator()
        # p 201, Proposition 6.6.5
        target = epsilon * (1 - self.discount) / self.discount
        step_span = self.getSpan(self.value - saved_value)
        bound = log(target / step_span) / log(self.discount * k)
        # undo the trial step so that self.value is unchanged by this method
        self.value = saved_value
        self.max_iter = ceil(bound)
    
    def iterate(self):
        """
 def test_exampleRand_dense_shape():
    """Call ``exampleRand(STATES, ACTIONS)`` with its default dense output.

    NOTE(review): nothing is asserted about the returned pair here.
    """
    transitions, rewards = exampleRand(STATES, ACTIONS)

def test_exampleRand_dense_check():
    """A dense ``exampleRand`` problem must be accepted by ``inst.check``."""
    P, R = exampleRand(STATES, ACTIONS)
    # The test expects check() to return None.  Compare by identity so that
    # an overloaded __eq__ on the returned object cannot affect the outcome.
    assert inst.check(P, R) is None

def test_exampleRand_sparse_shape():
    """Call ``exampleRand(STATES, ACTIONS)`` with ``is_sparse=True``.

    NOTE(review): nothing is asserted about the returned pair here.
    """
    transitions, rewards = exampleRand(STATES, ACTIONS, is_sparse=True)

def test_exampleRand_sparse_check():
    """A sparse ``exampleRand`` problem must be accepted by ``inst.check``."""
    P, R = exampleRand(STATES, ACTIONS, is_sparse=True)
    # The test expects check() to return None.  Compare by identity so that
    # an overloaded __eq__ on the returned object cannot affect the outcome.
    assert inst.check(P, R) is None

# ValueIteration

def test_ValueIteration():
    """Check the policy and iteration count stored on ``inst``.

    NOTE(review): ``inst`` is not created in this function; it has to come
    from the enclosing module.
    """
    assert inst.policy == (1, 0)
    assert inst.iter == 26

def test_ValueIteration_boundIter():
    """With discount 0.9 and epsilon 0.01 ``max_iter`` must end up as 28."""
    transitions = array([[[0.5, 0.5], [0.8, 0.2]], [[0, 1], [0.1, 0.9]]])
    rewards = array([[5, 10], [-1, 2]])
    solver = ValueIteration(transitions, rewards, 0.9, 0.01)
    assert solver.max_iter == 28

def test_JacksCarRental():
    S = 21 ** 2
    A = 11
