### Make cosmetic changes to improve style of code

parent 9a04a050
 ... @@ -164,11 +164,11 @@ def forest(S=3, r1=4, r2=2, p=0.1, is_sparse=False): ... @@ -164,11 +164,11 @@ def forest(S=3, r1=4, r2=2, p=0.1, is_sparse=False): rows = list(range(S)) * 2 rows = list(range(S)) * 2 cols =  * S + list(range(1, S)) + [S - 1] cols =  * S + list(range(1, S)) + [S - 1] vals = [p] * S + [1-p] * S vals = [p] * S + [1-p] * S P.append(_sp.coo_matrix((vals, (rows, cols)), shape=(S,S)).tocsr()) P.append(_sp.coo_matrix((vals, (rows, cols)), shape=(S, S)).tocsr()) rows = list(range(S)) rows = list(range(S)) cols =  * S cols =  * S vals =  * S vals =  * S P.append(_sp.coo_matrix((vals, (rows, cols)), shape=(S,S)).tocsr()) P.append(_sp.coo_matrix((vals, (rows, cols)), shape=(S, S)).tocsr()) else: else: P = _np.zeros((2, S, S)) P = _np.zeros((2, S, S)) P[0, :, :] = (1 - p) * _np.diag(_np.ones(S - 1), 1) P[0, :, :] = (1 - p) * _np.diag(_np.ones(S - 1), 1) ... @@ -182,7 +182,6 @@ def forest(S=3, r1=4, r2=2, p=0.1, is_sparse=False): ... @@ -182,7 +182,6 @@ def forest(S=3, r1=4, r2=2, p=0.1, is_sparse=False): R[:, 1] = _np.ones(S) R[:, 1] = _np.ones(S) R[0, 1] = 0 R[0, 1] = 0 R[S - 1, 1] = r2 R[S - 1, 1] = r2 # we want to return the generated transition and reward matrices return(P, R) return(P, R) def rand(S, A, is_sparse=False, mask=None): def rand(S, A, is_sparse=False, mask=None): ... @@ -338,12 +337,11 @@ def rand(S, A, is_sparse=False, mask=None): ... @@ -338,12 +337,11 @@ def rand(S, A, is_sparse=False, mask=None): P[a][s] = P[a][s] / P[a][s].sum() P[a][s] = P[a][s] / P[a][s].sum() R[a][s] = (m * (2 * _np.random.random(S) - R[a][s] = (m * (2 * _np.random.random(S) - _np.ones(S, dtype=int))) _np.ones(S, dtype=int))) # we want to return the generated transition and reward matrices return(P, R) return(P, R) def small(): def small(): """A very small Markov decision process. """A very small Markov decision process. The probability transition matrices are:: The probability transition matrices are:: | | 0.5 0.5 | | | | 0.5 0.5 | | ... @@ -356,7 +354,7 @@ def small(): ... @@ -356,7 +354,7 @@ def small(): R = | 5 10 | R = | 5 10 | | -1 2 | | -1 2 | Returns Returns ======= ======= out : tuple out : tuple ... @@ -378,6 +376,6 @@ def small(): ... @@ -378,6 +376,6 @@ def small(): [-1, 2]]) [-1, 2]]) """ """ P = _np.array([[[0.5, 0.5],[0.8, 0.2]],[[0, 1],[0.1, 0.9]]]) P = _np.array([[[0.5, 0.5], [0.8, 0.2]], [[0, 1], [0.1, 0.9]]]) R = _np.array([[5, 10], [-1, 2]]) R = _np.array([[5, 10], [-1, 2]]) return(P, R) return(P, R)
 ... @@ -246,7 +246,7 @@ class MDP(object): ... @@ -246,7 +246,7 @@ class MDP(object): if P.ndim == 3: if P.ndim == 3: self.S = P.shape self.S = P.shape else: else: self.S = P.shape self.S = P.shape except AttributeError: except AttributeError: self.S = P.shape self.S = P.shape # convert P to a tuple of numpy arrays # convert P to a tuple of numpy arrays ... @@ -281,14 +281,14 @@ class MDP(object): ... @@ -281,14 +281,14 @@ class MDP(object): self.R = tuple(r for aa in range(self.A)) self.R = tuple(r for aa in range(self.A)) elif R.ndim == 2: elif R.ndim == 2: self.R = tuple(_np.array(R[:, aa]).reshape(self.S) self.R = tuple(_np.array(R[:, aa]).reshape(self.S) for aa in range(self.A)) for aa in range(self.A)) else: else: self.R = tuple(_np.multiply(P[aa], R[aa]).sum(1).reshape(self.S) self.R = tuple(_np.multiply(P[aa], R[aa]).sum(1).reshape(self.S) for aa in range(self.A)) for aa in range(self.A)) except AttributeError: except AttributeError: if len(R) == self.A: if len(R) == self.A: self.R = tuple(_np.multiply(P[aa], R[aa]).sum(1).reshape(self.S) self.R = tuple(_np.multiply(P[aa], R[aa]).sum(1).reshape(self.S) for aa in range(self.A)) for aa in range(self.A)) else: else: r = _np.array(R).reshape(self.S) r = _np.array(R).reshape(self.S) self.R = tuple(r for aa in range(self.A)) self.R = tuple(r for aa in range(self.A)) ... @@ -375,8 +375,6 @@ class FiniteHorizon(MDP): ... @@ -375,8 +375,6 @@ class FiniteHorizon(MDP): # Set the reward for the final transition to h, if specified. # Set the reward for the final transition to h, if specified. if h is not None: if h is not None: self.V[:, N] = h self.V[:, N] = h # Call the iteration method #self.run() def run(self): def run(self): # Run the finite horizon algorithm. # Run the finite horizon algorithm. ... @@ -459,8 +457,6 @@ class LP(MDP): ... @@ -459,8 +457,6 @@ class LP(MDP): # this doesn't do what I want it to do c.f. issue #3 # this doesn't do what I want it to do c.f. issue #3 if not self.verbose: if not self.verbose: solvers.options['show_progress'] = False solvers.options['show_progress'] = False # Call the iteration method #self.run() def run(self): def run(self): #Run the linear programming algorithm. #Run the linear programming algorithm. ... @@ -488,7 +484,7 @@ class LP(MDP): ... @@ -488,7 +484,7 @@ class LP(MDP): # only to 10e-8 places. This assumes glpk is installed of course. # only to 10e-8 places. This assumes glpk is installed of course. self.V = _np.array(self._linprog(f, M, -h)['x']).reshape(self.S) self.V = _np.array(self._linprog(f, M, -h)['x']).reshape(self.S) # apply the Bellman operator # apply the Bellman operator self.policy, self.V = self._bellmanOperator() self.policy, self.V = self._bellmanOperator() # update the time spent solving # update the time spent solving self.time = _time.time() - self.time self.time = _time.time() - self.time # store value and policy as tuples # store value and policy as tuples ... @@ -560,7 +556,7 @@ class PolicyIteration(MDP): ... @@ -560,7 +556,7 @@ class PolicyIteration(MDP): # Set up the MDP, but don't need to worry about epsilon values # Set up the MDP, but don't need to worry about epsilon values MDP.__init__(self, transitions, reward, discount, None, max_iter) MDP.__init__(self, transitions, reward, discount, None, max_iter) # Check if the user has supplied an initial policy. If not make one. # Check if the user has supplied an initial policy. If not make one. if policy0 == None: if policy0 is None: # Initialise the policy to the one which maximises the expected # Initialise the policy to the one which maximises the expected # immediate reward # immediate reward null = _np.zeros(self.S) null = _np.zeros(self.S) ... @@ -592,8 +588,6 @@ class PolicyIteration(MDP): ... @@ -592,8 +588,6 @@ class PolicyIteration(MDP): raise ValueError("'eval_type' should be '0' for matrix evaluation " raise ValueError("'eval_type' should be '0' for matrix evaluation " "or '1' for iterative evaluation. The strings " "or '1' for iterative evaluation. The strings " "'matrix' and 'iterative' can also be used.") "'matrix' and 'iterative' can also be used.") # Call the iteration method #self.run() def _computePpolicyPRpolicy(self): def _computePpolicyPRpolicy(self): # Compute the transition matrix and the reward matrix for a policy. # Compute the transition matrix and the reward matrix for a policy. ... @@ -768,7 +762,7 @@ class PolicyIteration(MDP): ... @@ -768,7 +762,7 @@ class PolicyIteration(MDP): done = True done = True if self.verbose: if self.verbose: print(_MSG_STOP_UNCHANGING_POLICY) print(_MSG_STOP_UNCHANGING_POLICY) elif (self.iter == self.max_iter): elif self.iter == self.max_iter: done = True done = True if self.verbose: if self.verbose: print(_MSG_STOP_MAX_ITER) print(_MSG_STOP_MAX_ITER) ... @@ -857,9 +851,6 @@ class PolicyIterationModified(PolicyIteration): ... @@ -857,9 +851,6 @@ class PolicyIterationModified(PolicyIteration): Rmin = min(R.min() for R in self.R) Rmin = min(R.min() for R in self.R) self.V = 1 / (1 - self.discount) * Rmin * _np.ones((self.S,)) self.V = 1 / (1 - self.discount) * Rmin * _np.ones((self.S,)) # Call the iteration method #self.run() def run(self): def run(self): # Run the modified policy iteration algorithm. # Run the modified policy iteration algorithm. ... @@ -991,9 +982,6 @@ class QLearning(MDP): ... @@ -991,9 +982,6 @@ class QLearning(MDP): self.Q = _np.zeros((self.S, self.A)) self.Q = _np.zeros((self.S, self.A)) self.mean_discrepancy = [] self.mean_discrepancy = [] # Call the iteration method #self.run() def run(self): def run(self): # Run the Q-learning algoritm. # Run the Q-learning algoritm. discrepancy = [] discrepancy = [] ... @@ -1006,13 +994,13 @@ class QLearning(MDP): ... @@ -1006,13 +994,13 @@ class QLearning(MDP): for n in range(1, self.max_iter + 1): for n in range(1, self.max_iter + 1): # Reinitialisation of trajectories every 100 transitions # Reinitialisation of trajectories every 100 transitions if ((n % 100) == 0): if (n % 100) == 0: s = _np.random.randint(0, self.S) s = _np.random.randint(0, self.S) # Action choice : greedy with increasing probability # Action choice : greedy with increasing probability # probability 1-(1/log(n+2)) can be changed # probability 1-(1/log(n+2)) can be changed pn = _np.random.random() pn = _np.random.random() if (pn < (1 - (1 / _math.log(n + 2)))): if pn < (1 - (1 / _math.log(n + 2))): # optimal_action = self.Q[s, :].max() # optimal_action = self.Q[s, :].max() a = self.Q[s, :].argmax() a = self.Q[s, :].argmax() else: else: ... @@ -1022,7 +1010,7 @@ class QLearning(MDP): ... @@ -1022,7 +1010,7 @@ class QLearning(MDP): p_s_new = _np.random.random() p_s_new = _np.random.random() p = 0 p = 0 s_new = -1 s_new = -1 while ((p < p_s_new) and (s_new < (self.S - 1))): while (p < p_s_new) and (s_new < (self.S - 1)): s_new = s_new + 1 s_new = s_new + 1 p = p + self.P[a][s, s_new] p = p + self.P[a][s, s_new] ... @@ -1139,9 +1127,6 @@ class RelativeValueIteration(MDP): ... @@ -1139,9 +1127,6 @@ class RelativeValueIteration(MDP): self.average_reward = None self.average_reward = None # Call the iteration method #self.run() def run(self): def run(self): # Run the relative value iteration algorithm. # Run the relative value iteration algorithm. ... @@ -1153,7 +1138,7 @@ class RelativeValueIteration(MDP): ... @@ -1153,7 +1138,7 @@ class RelativeValueIteration(MDP): while not done: while not done: self.iter += 1; self.iter += 1 self.policy, Vnext = self._bellmanOperator() self.policy, Vnext = self._bellmanOperator() Vnext = Vnext - self.gain Vnext = Vnext - self.gain ... @@ -1164,15 +1149,15 @@ class RelativeValueIteration(MDP): ... @@ -1164,15 +1149,15 @@ class RelativeValueIteration(MDP): print((" %s\t\t %s" % (self.iter, variation))) print((" %s\t\t %s" % (self.iter, variation))) if variation < self.epsilon: if variation < self.epsilon: done = True done = True self.average_reward = self.gain + (Vnext - self.V).min() self.average_reward = self.gain + (Vnext - self.V).min() if self.verbose: if self.verbose: print(_MSG_STOP_EPSILON_OPTIMAL_POLICY) print(_MSG_STOP_EPSILON_OPTIMAL_POLICY) elif self.iter == self.max_iter: elif self.iter == self.max_iter: done = True done = True self.average_reward = self.gain + (Vnext - self.V).min() self.average_reward = self.gain + (Vnext - self.V).min() if self.verbose: if self.verbose: print(_MSG_STOP_MAX_ITER) print(_MSG_STOP_MAX_ITER) self.V = Vnext self.V = Vnext self.gain = float(self.V[self.S - 1]) self.gain = float(self.V[self.S - 1]) ... @@ -1320,9 +1305,6 @@ class ValueIteration(MDP): ... @@ -1320,9 +1305,6 @@ class ValueIteration(MDP): # threshold of variation for V for an epsilon-optimal policy # threshold of variation for V for an epsilon-optimal policy self.thresh = epsilon self.thresh = epsilon # Call the iteration method #self.run() def _boundIter(self, epsilon): def _boundIter(self, epsilon): # Compute a bound for the number of iterations. # Compute a bound for the number of iterations. # # ... @@ -1395,7 +1377,7 @@ class ValueIteration(MDP): ... @@ -1395,7 +1377,7 @@ class ValueIteration(MDP): if self.verbose: if self.verbose: print(_MSG_STOP_EPSILON_OPTIMAL_POLICY) print(_MSG_STOP_EPSILON_OPTIMAL_POLICY) break break elif (self.iter == self.max_iter): elif self.iter == self.max_iter: if self.verbose: if self.verbose: print(_MSG_STOP_MAX_ITER) print(_MSG_STOP_MAX_ITER) break break ... @@ -1491,9 +1473,6 @@ class ValueIterationGS(ValueIteration): ... @@ -1491,9 +1473,6 @@ class ValueIterationGS(ValueIteration): # threshold of variation for V for an epsilon-optimal policy # threshold of variation for V for an epsilon-optimal policy self.thresh = epsilon self.thresh = epsilon # Call the iteration method #self.run() def run(self): def run(self): # Run the value iteration Gauss-Seidel algorithm. # Run the value iteration Gauss-Seidel algorithm. ... @@ -1534,7 +1513,7 @@ class ValueIterationGS(ValueIteration): ... @@ -1534,7 +1513,7 @@ class ValueIterationGS(ValueIteration): for s in range(self.S): for s in range(self.S): Q = _np.zeros(self.A) Q = _np.zeros(self.A) for a in range(self.A): for a in range(self.A): Q[a] = self.R[a][s] + self.discount * self.P[a][s,:].dot(self.V) Q[a] = self.R[a][s] + self.discount * self.P[a][s, :].dot(self.V) self.V[s] = Q.max() self.V[s] = Q.max() self.policy.append(int(Q.argmax())) self.policy.append(int(Q.argmax())) ... ...
 ... @@ -19,12 +19,12 @@ getSpan ... @@ -19,12 +19,12 @@ getSpan # Copyright (c) 2011-2013 Steven A. W. Cordwell # Copyright (c) 2011-2013 Steven A. W. Cordwell # Copyright (c) 2009 INRA # Copyright (c) 2009 INRA # # # All rights reserved. # All rights reserved. # # # Redistribution and use in source and binary forms, with or without # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # modification, are permitted provided that the following conditions are met: # # # * Redistributions of source code must retain the above copyright notice, # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # * Redistributions in binary form must reproduce the above copyright notice, ... @@ -33,7 +33,7 @@ getSpan ... @@ -33,7 +33,7 @@ getSpan # * Neither the name of the nor the names of its contributors # * Neither the name of the nor the names of its contributors # may be used to endorse or promote products derived from this software # may be used to endorse or promote products derived from this software # without specific prior written permission. # without specific prior written permission. # # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ... @@ -49,7 +49,7 @@ getSpan ... @@ -49,7 +49,7 @@ getSpan import numpy as _np import numpy as _np # These need to be fixed so that we use classes derived from Error. # These need to be fixed so that we use classes derived from Error. mdperr = { MDPERR = { "mat_nonneg" : "mat_nonneg" : "Transition probabilities must be non-negative.", "Transition probabilities must be non-negative.", "mat_square" : "mat_square" : ... @@ -84,9 +84,9 @@ mdperr = { ... @@ -84,9 +84,9 @@ mdperr = { def check(P, R): def check(P, R): """Check if ``P`` and ``R`` define a valid Markov Decision Process (MDP). """Check if ``P`` and ``R`` define a valid Markov Decision Process (MDP). Let ``S`` = number of states, ``A`` = number of actions. Let ``S`` = number of states, ``A`` = number of actions. Parameters Parameters --------- --------- P : array P : array ... @@ -99,18 +99,18 @@ def check(P, R): ... @@ -99,18 +99,18 @@ def check(P, R): shape of (S, A, A). It can also be a one dimensional array with a shape of (S, A, A). It can also be a one dimensional array with a shape of (A, ), where each element contains matrix with a shape of shape of (A, ), where each element contains matrix with a shape of (S, S) which can possibly be sparse. It can also be an array with (S, S) which can possibly be sparse. It can also be an array with a shape of (S, A) which can possibly be sparse. a shape of (S, A) which can possibly be sparse. Notes Notes ----- ----- Raises an error if ``P`` and ``R`` do not define a MDP. Raises an error if ``P`` and ``R`` do not define a MDP. Examples Examples -------- -------- >>> import mdptoolbox, mdptoolbox.example >>> import mdptoolbox, mdptoolbox.example >>> P_valid, R_valid = mdptoolbox.example.rand(100, 5) >>> P_valid, R_valid = mdptoolbox.example.rand(100, 5) >>> mdptoolbox.util.check(P_valid, R_valid) # Nothing should happen >>> mdptoolbox.util.check(P_valid, R_valid) # Nothing should happen >>> >>> >>> import numpy as np >>> import numpy as np >>> P_invalid = np.random.rand(5, 100, 100) >>> P_invalid = np.random.rand(5, 100, 100) >>> mdptoolbox.util.check(P_invalid, R_valid) # Raises an exception >>> mdptoolbox.util.check(P_invalid, R_valid) # Raises an exception ... @@ -128,7 +128,7 @@ def check(P, R): ... @@ -128,7 +128,7 @@ def check(P, R): # continue checking from there # continue checking from there raise AttributeError raise AttributeError else: else: raise InvalidMDPError(mdperr["P_shape"]) raise InvalidMDPError(MDPERR["P_shape"]) except AttributeError: except AttributeError: try: try: aP = len(P) aP = len(P) ... @@ -136,9 +136,9 @@ def check(P, R): ... @@ -136,9 +136,9 @@ def check(P, R): for aa in range(1, aP): for aa in range(1, aP): sP0aa, sP1aa = P[aa].shape sP0aa, sP1aa = P[aa].shape if (sP0aa != sP0) or (sP1aa != sP1): if (sP0aa != sP0) or (sP1aa != sP1): raise InvalidMDPError(mdperr["obj_square"]) raise InvalidMDPError(MDPERR["obj_square"]) except AttributeError: except AttributeError: raise InvalidMDPError(mdperr["P_shape"]) raise InvalidMDPError(MDPERR["P_shape"]) # Checking R # Checking R try: try: ndimR = R.ndim ndimR = R.ndim ... @@ -151,7 +151,7 @@ def check(P, R): ... @@ -151,7 +151,7 @@ def check(P, R): elif ndimR == 3: elif ndimR == 3: aR, sR0, sR1 = R.shape aR, sR0, sR1 = R.shape else: else: raise InvalidMDPError(mdperr["R_shape"]) raise InvalidMDPError(MDPERR["R_shape"]) except AttributeError: except AttributeError: try: try: lenR = len(R) lenR = len(R) ... @@ -160,15 +160,15 @@ def check(P, R): ... @@ -160,15 +160,15 @@ def check(P, R): sR0, sR1 = R.shape sR0, sR1 = R.shape for aa in range(1, aR): for aa in range(1, aR): sR0aa, sR1aa = R[aa].shape sR0aa, sR1aa = R[aa].shape if ((sR0aa != sR0) or (sR1aa != sR1)): if (sR0aa != sR0) or (sR1aa != sR1): raise InvalidMDPError(mdperr["obj_square"]) raise InvalidMDPError(MDPERR["obj_square"]) elif lenR == sP0: elif lenR == sP0: aR = aP aR = aP sR0 = sR1 = lenR sR0 = sR1 = lenR else: else: raise InvalidMDPError(mdperr["R_shape"]) raise InvalidMDPError(MDPERR["R_shape"]) except AttributeError: except AttributeError: raise InvalidMDPError(mdperr["R_shape"]) raise InvalidMDPError(MDPERR["R_shape"]) # Checking dimensions # Checking dimensions assert sP0 > 0, "The number of states in P must be greater than 0." assert sP0 > 0, "The number of states in P must be greater than 0." assert aP > 0, "The number of actions in P must be greater than 0." assert aP > 0, "The number of actions in P must be greater than 0." ... @@ -183,13 +183,12 @@ def check(P, R): ... @@ -183,13 +183,12 @@ def check(P, R): checkSquareStochastic(P[aa]) checkSquareStochastic(P[aa]) # We are at the end of the checks, so if no exceptions have been raised # We are at the end of the checks, so if no exceptions have been raised # then that means there are (hopefullly) no errors and we return None # then that means there are (hopefullly) no errors and we return None return None # These are the old code comments, which need to be converted to # These are the old code comments, which need to be converted to # information in the docstring: # information in the docstring: # # # tranitions must be a numpy array either an AxSxS ndarray (with any # tranitions must be a numpy array either an AxSxS ndarray (with any # dtype other than "object"); or, a 1xA ndarray with a "object" dtype, # dtype other than "object"); or, a 1xA ndarray with a "object" dtype, # and each element containing an SxS array. An AxSxS array will be # and each element containing an SxS array. An AxSxS array will be # be converted to an object array. A numpy object array is similar to a # be converted to an object array. A numpy object array is similar to a # MATLAB cell array. # MATLAB cell array. ... @@ -208,7 +207,7 @@ def check(P, R): ... @@ -208,7 +207,7 @@ def check(P, R): # As above but for the reward array. A difference is that the reward # As above but for the reward array. A difference is that the reward # array can have either two or 3 dimensions. # array can have either two or 3 dimensions. # # # We want to make sure that the transition probability array and the # We want to make sure that the transition probability array and the # reward array are in agreement. This means that both should show that # reward array are in agreement. This means that both should show that # there are the same number of actions and the same number of states. # there are the same number of actions and the same number of states. # Furthermore the probability of transition matrices must be SxS in # Furthermore the probability of transition matrices must be SxS in ... @@ -238,7 +237,7 @@ def check(P, R): ... @@ -238,7 +237,7 @@ def check(P, R): # telling the user what needs to be fixed. # telling the user what needs to be fixed. # # # if we are using a normal array for this, then the first # if we are using a normal array for this, then the first # dimension should be the number of actions, and the second and # dimension should be the number of actions, and the second and # third should be the number of states # third should be the number of states # # # the first dimension of the transition matrix must report the same # the first dimension of the transition matrix must report the same ... @@ -253,14 +252,14 @@ def check(P, R): ... @@ -253,14 +252,14 @@ def check(P, R): # normal arrays this is a matrix formed by taking a slice of the array # normal arrays this is a matrix formed by taking a slice of the array # # # if the rewarad array has an object dtype, then we check that # if the rewarad array has an object dtype, then we check that # each element contains a matrix of the same shape as we did # each element contains a matrix of the same shape as we did # above with the transition array. # above with the transition array. # # # This indicates that the reward matrices are constructed per # This indicates that the reward matrices are constructed per # transition, so that the first dimension is the actions and # transition, so that the first dimension is the actions and # the second two dimensions are the states. # the second two dimensions are the states. # # # then the reward matrix is per state, so the first dimension is # then the reward matrix is per state, so the first dimension is # the states and the second dimension is the actions. # the states and the second dimension is the actions.