Zahra Rajabi / pymdptoolbox · Commits

Commit 60e89bf1
authored Jan 21, 2013 by Steven Cordwell

various small fixes

parent 8757cec0
Changes: 2 files
mdp.py
@@ -240,28 +240,36 @@ class MDP(object):
     def __init__(self, transitions, reward, discount, max_iter):
         """"""
-        # the verbosity is by default turned off
-        self.verbose = False
-        # Initially the time taken to perform the computations is set to None
-        self.time = None
-        
-        if (discount <= 0) or (discount > 1):
-            raise ValueError(mdperr["discount_rng"])
-        else:
-            self.discount = discount
+        
+        if (type(discount) is int) or (type(discount) is float):
+            if (discount <= 0) or (discount > 1):
+                raise ValueError(mdperr["discount_rng"])
+            else:
+                self.discount = discount
+        elif not discount is None:
+            raise ValueError("PyMDPtoolbox: the discount must be a positive real number less than or equal to one.")
         
-        if (max_iter <= 0):
-            raise ValueError(mdperr["maxi_min"])
-        else:
-            self.max_iter = max_iter
+        if (type(max_iter) is int) or (type(max_iter) is float):
+            if (max_iter <= 0):
+                raise ValueError(mdperr["maxi_min"])
+            else:
+                self.max_iter = max_iter
+        elif not max_iter is None:
+            raise ValueError("PyMDPtoolbox: max_iter must be a positive real number greater than zero.")
         
         self.check(transitions, reward)
         
         self.computePR(transitions, reward)
         
+        # the verbosity is by default turned off
+        self.verbose = False
+        # Initially the time taken to perform the computations is set to None
+        self.time = None
         # set the initial iteration count to zero
         self.iter = 0
+        
+        self.value = None
+        self.policy = None
     
     def bellmanOperator(self):
         """
@@ -448,12 +456,14 @@ class MDP(object):
     def checkSquareStochastic(self, Z):
         """Check if Z is a square stochastic matrix
         
-        Arguments
-        --------------------------------------------------------------
-            Z = a numpy ndarray SxS, possibly sparse (csr_matrix)
+        Parameters
+        ----------
+        Z : a SxS matrix. It could be a numpy ndarray SxS, or a scipy.sparse
+            csr_matrix
         
         Evaluation
-        ---------------------------------------------------
-            error_msg = error message or None if correct
+        ----------
+        Returns None if no error has been detected
         """
         s1, s2 = Z.shape
         if (s1 != s2):
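For reference, a rough standalone equivalent of the check this docstring describes — square, non-negative, rows summing to one — assuming a dense numpy array; the function name and tolerance are illustrative, not the toolbox API:

from numpy import absolute, array

def check_square_stochastic(Z, tol=1e-10):
    s1, s2 = Z.shape
    if s1 != s2:
        raise ValueError("matrix is not square")
    if (absolute(Z.sum(axis=1) - 1) > tol).any():
        raise ValueError("rows do not sum to one")
    if (Z < 0).any():
        raise ValueError("matrix has negative entries")

check_square_stochastic(array([[0.5, 0.5], [0.8, 0.2]]))  # returns None: no error detected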
@@ -582,33 +592,30 @@ class FiniteHorizon(MDP):
     """
     
-    def __init__(self, P, R, discount, N, h):
+    def __init__(self, transitions, reward, discount, N, h=None):
+        """"""
         if N < 1:
             raise ValueError('MDP Toolbox ERROR: N must be upper than 0')
-        if discount <= 0 or discount > 1:
-            raise ValueError('MDP Toolbox ERROR: Discount rate must be in ]0; 1]')
-        if iscell(P):
-            S = size(P[1], 1)
         else:
-            S = size(P, 1)
-        
-        V = zeros(S, N + 1)
+            self.N = N
         
-        if nargin == 5:
-            V[:, N + 1] = h
+        MDP.__init__(self, transitions, reward, discount, None)
         
-        PR = mdp_computePR(P, R);
+        self.value = zeros(self.S, N + 1)
+        
+        if not h is None:
+            self.value[:, N + 1] = h
     
-    def iterate():
+    def iterate(self):
+        """"""
         self.time = time()
         
-        for n in range(N - 1):
-            W, X = self.bellmanOperator(P, PR, discount, V[:, N - n + 1])
-            V[:, N - n] = W
-            policy[:, N - n] = X
-            #if mdp_VERBOSE
-            #    disp(['stage:' num2str(N-n) ' policy transpose : ' num2str(policy(:,N-n)')])
+        for n in range(self.N - 1):
+            W, X = self.bellmanOperator(self.P, self.R, self.discount, self.value[:, self.N - n + 1])
+            self.value[:, self.N - n] = W
+            self.policy[:, self.N - n] = X
+            if self.verbose:
+                print("stage: %s ... policy transpose : %s") % (self.N - n, self.policy[:, self.N - n].T)
         
         self.time = time() - self.time
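The reworked iterate is a backward recursion over the horizon stages. A compact standalone sketch of the same recursion in plain numpy, using the two-state example from test_mdptoolbox.py and an assumed horizon N = 3 with zero terminal values (the toolbox classes themselves are not used here):

from numpy import array, zeros

P = array([[[0.5, 0.5], [0.8, 0.2]],
           [[0.0, 1.0], [0.1, 0.9]]])
R = array([[5, 10], [-1, 2]])            # R[s, a]
discount, S, A, N = 0.9, 2, 2, 3

V = zeros((S, N + 1))                    # V[:, N] holds the terminal values h
policy = zeros((S, N), dtype=int)
for n in range(N - 1, -1, -1):           # stage N-1 down to stage 0
    Q = array([R[:, a] + discount * P[a].dot(V[:, n + 1]) for a in range(A)]).T
    V[:, n] = Q.max(axis=1)
    policy[:, n] = Q.argmax(axis=1)
print(V[:, 0], policy[:, 0])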
@@ -642,26 +649,18 @@ class LP(MDP):
     --------
     """
     
-    def __init__(self, P, R, discount):
+    def __init__(self, transitions, reward, discount):
+        """"""
         
         try:
             from cvxopt import matrix, solvers
+            self.linprog = solvers.lp
         except ImportError:
             raise ImportError("The python module cvxopt is required to use linear programming functionality.")
-        self.linprog = solvers.lp
-        
-        from scipy.sparse import eye as speye
-        
-        if discount <= 0 or discount >= 1:
-            print('MDP Toolbox ERROR: Discount rate must be in ]0; 1[')
-        if iscell(P):
-            S = size(P[1], 1)
-            A = length(P)
-        else:
-            S = size(P, 1)
-            A = size(P, 3)
-        PR = self.computePR(P, R)
+        
+        MDP.__init__(self, transitions, reward, discount, None)
         
         # The objective is to resolve : min V / V >= PR + discount*P*V
         # The function linprog of the optimisation Toolbox of Mathworks resolves :
@@ -669,23 +668,22 @@ class LP(MDP):
         # So the objective could be expressed as : min V / (discount*P-I) * V <= - PR
         # To avoid loop on states, the matrix M is structured following actions M(A*S,S)
-        f = ones(S, 1)
-        
-        M = []
-        if iscell(P):
-            for a in range(A):
-                M = hstack((M, discount * P[a] - speye(S)))
-        else:
-            for a in range(A):
-                M = hstack((M, discount * P[:,:,a] - speye(S)))
+        self.f = ones(self.S, 1)
+        
+        self.M = zeros((self.A * self.S, self.S))
+        for aa in range(self.A):
+            pos = (aa + 1) * self.S
+            self.M[(pos - self.S):pos, :] = discount * self.P[aa] - speye(self.S, self.S)
+        self.M = matrix(self.M)
     
-    def iterate(self, linprog):
+    def iterate(self):
+        """"""
         
         self.time = time()
         
-        V = self.linprog(f, M, -PR)
+        self.value = self.linprog(self.f, self.M, -self.R)
         
-        V, policy = self.bellmanOperator(P, PR, discount, V)
+        self.value, self.policy = self.bellmanOperator(self.P, self.R, self.discount, self.value)
         
         self.time = time() - self.time
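The comments above describe the linear program: minimise sum(V) subject to V >= PR + discount*P*V for every action, i.e. (discount*P - I)*V <= -PR. A hedged illustration of that formulation, solved with scipy.optimize.linprog instead of cvxopt and using the two-state example from the test file; this sketches the formulation only, not the solver path the LP class itself takes:

from numpy import array, eye, hstack, ones, vstack
from scipy.optimize import linprog

P = array([[[0.5, 0.5], [0.8, 0.2]],
           [[0.0, 1.0], [0.1, 0.9]]])
R = array([[5, 10], [-1, 2]])                    # R[s, a]
discount, S, A = 0.9, 2, 2

f = ones(S)                                      # objective: minimise sum of V
M = vstack([discount * P[a] - eye(S) for a in range(A)])
h = hstack([-R[:, a] for a in range(A)])

res = linprog(f, A_ub=M, b_ub=h, bounds=[(None, None)] * S)
print(res.x)                                     # approximately [42.4419, 36.0465]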
@@ -808,15 +806,11 @@ class PolicyIterationModified(MDP):
     def __init__(self, transitions, reward, discount, epsilon=0.01, max_iter=10):
         """"""
         
-        MDP.__init__(self, discount, max_iter)
+        MDP.__init__(self, transitions, reward, discount, max_iter)
         
         if epsilon <= 0:
             raise ValueError("epsilon must be greater than 0")
         
-        self.check(transitions, reward)
-        
-        self.computePR(transitions, reward)
-        
         # computation of threshold of variation for V for an epsilon-optimal policy
         if self.discount != 1:
             self.thresh = epsilon * (1 - self.discount) / self.discount
@@ -827,9 +821,11 @@ class PolicyIterationModified(MDP):
             self.value = matrix(zeros((self.S, 1)))
         else:
             # min(min()) is not right
-            self.value = 1 / (1 - discount) * min(min(self.PR)) * ones((self.S, 1))
-        
-        self.iter = 0
+            self.value = 1 / (1 - discount) * min(min(self.R)) * ones((self.S, 1))
+    
+    def evalPolicyIterative(self):
+        """"""
+        pass
     
     def iterate(self):
         """"""
@@ -838,7 +834,7 @@ class PolicyIterationModified(MDP):
             print(' Iteration V_variation')
         
         self.time = time()
         
         done = False
         while not done:
             self.iter = self.iter + 1
@@ -846,23 +842,23 @@ class PolicyIterationModified(MDP):
             Vnext, policy = self.bellmanOperator(self.P, self.PR, self.discount, self.V)
             #[Ppolicy, PRpolicy] = mdp_computePpolicyPRpolicy(P, PR, policy);
             
-            variation = self.getSpan(Vnext - V);
+            variation = self.getSpan(Vnext - self.value);
             if self.verbose:
                 print(" %s %s" % (self.iter, variation))
             
-            V = Vnext
-            if variation < thresh:
+            self.value = Vnext
+            if variation < self.thresh:
                 done = True
             else:
                 is_verbose = False
                 if self.verbose:
-                    self.verbose = False
+                    self.setSilent
                     is_verbose = True
                 
-                V = self.evalPolicyIterative(self.P, self.PR, self.discount, self.policy, self.V, self.epsilon, self.max_iter)
+                self.value = self.evalPolicyIterative()
                 
                 if is_verbose:
-                    self.verbose = True
+                    self.setVerbose
         
         self.time = time() - self.time
@@ -1055,13 +1051,8 @@ class RelativeValueIteration(MDP):
         if epsilon <= 0:
            print('MDP Toolbox ERROR: epsilon must be upper than 0')
        
-        if iscell(P):
-            S = size(P[1], 1)
-        else:
-            S = size(P, 1)
-        
-        self.U = zeros(S, 1)
-        self.gain = U(S)
+        self.U = zeros(self.S, 1)
+        self.gain = self.U[self.S]
     
     def iterate(self):
         """"""
@@ -1076,7 +1067,7 @@ class RelativeValueIteration(MDP):
             self.iter = self.iter + 1;
             
-            Unext, policy = self.bellmanOperator(self.P, self.PR, 1, self.U)
+            Unext, policy = self.bellmanOperator(self.P, self.R, 1, self.U)
             Unext = Unext - self.gain
             
             variation = self.getSpan(Unext - self.U)
@@ -1320,7 +1311,7 @@ class ValueIteration(MDP):
         
         self.time = time() - self.time

-class ValueIterationGS(MDP):
+class ValueIterationGS(ValueIteration):
     """Resolution of discounted MDP with value iteration Gauss-Seidel algorithm
     
     Arguments
@@ -1359,7 +1350,7 @@ class ValueIterationGS(MDP):
     def __init__(self, transitions, reward, discount, epsilon=0.01, max_iter=10, initial_value=0):
         """"""
         
-        MDP.__init__(self, discount, max_iter)
+        MDP.__init__(self, transitions, reward, discount, max_iter)
         
         # initialization of optional arguments
         if (initial_value == 0):
@@ -1371,57 +1362,25 @@ class ValueIterationGS(MDP):
         if epsilon <= 0:
             raise ValueError("epsilon must be greater than 0")
         
-        #if discount == 1
-        #    disp('--------------------------------------------------------')
-        #    disp('MDP Toolbox WARNING: check conditions of convergence.')
-        #    disp('With no discount, convergence is not always assumed.')
-        #    disp('--------------------------------------------------------')
-        #end;
-        
-        PR = self.computePR(P, R)
-        
-        #% initialization of optional arguments
-        #if nargin < 6; V0 = zeros(S,1); end;
-        #if nargin < 4; epsilon = 0.01; end;
-        #% compute a bound for the number of iterations
-        #if discount ~= 1
-        #    computed_max_iter = mdp_value_iteration_bound_iter(P, R, discount, epsilon, V0);
-        #end;
-        #if nargin < 5
-        #    if discount ~= 1
-        #        max_iter = computed_max_iter;
-        #    else
-        #        max_iter = 1000;
-        #    end;
-        #else
-        #    if discount ~= 1 && max_iter > computed_max_iter
-        #        disp(['MDP Toolbox WARNING: max_iter is bounded by ' num2str(computed_max_iter,'%12.1f') ])
-        #        max_iter = computed_max_iter;
-        #    end;
-        #end;
-        #% computation of threshold of variation for V for an epsilon-optimal policy
-        #if discount ~= 1
-        #    thresh = epsilon * (1-discount)/discount
-        #else
-        #    thresh = epsilon
-        
-        self.discount = discount
+        if discount == 1:
+            print('PyMDPtoolbox WARNING: check conditions of convergence.'
+                  'With no discount, convergence is not always assumed.')
         
         if (discount < 1):
             # compute a bound for the number of iterations
-            #self.max_iter = self.boundIter(epsilon)
-            self.max_iter = 5000
-            print('MDP Toolbox WARNING: max_iter is bounded by %s') % self.max_iter
+            self.boundIter(epsilon)
             # computation of threshold of variation for V for an epsilon-optimal policy
             self.thresh = epsilon * (1 - self.discount) / self.discount
         else: # discount == 1
-            # bound for the number of iterations
-            self.max_iter = max_iter
             # threshold of variation for V for an epsilon-optimal policy
             self.thresh = epsilon
         
         self.iter = 0
     
-    def iterate(self, PR):
+    def iterate(self):
         """"""
         
-        V = self.initial_value
+        V = self.value
         
         done = False
@@ -1437,18 +1396,15 @@ class ValueIterationGS(MDP):
             for s in range(self.S):
                 for a in range(self.A):
-                    if iscell(P):
-                        Q[a] = PR[s, a] + discount * P[a][s, :] * V
-                    else:
-                        Q[a] = PR[s, a] + discount * P[s, :, a] * V
-                
-                V[s] = max(Q)
+                    Q[a] = self.R[s, a] + self.discount * self.P[a][s, :] * self.value
+                
+                self.value[s] = max(Q)
             
             variation = self.getSpan(V - Vprev)
             
             if self.verbose:
                 print(" %s %s" % (self.iter, variation))
             
-            if variation < thresh:
+            if variation < self.thresh:
                 done = True
                 if self.verbose:
                     print('MDP Toolbox : iterations stopped, epsilon-optimal policy found')
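For comparison, a compact standalone sketch of the Gauss-Seidel sweep this loop performs: each value is overwritten in place, so states later in the same sweep already see the updated values. Example data is taken from the test file; the sweep limit and tolerance are arbitrary, and the toolbox class is not used:

from numpy import array, zeros

P = array([[[0.5, 0.5], [0.8, 0.2]],
           [[0.0, 1.0], [0.1, 0.9]]])
R = array([[5, 10], [-1, 2]])
discount, S, A = 0.9, 2, 2

V = zeros(S)
for sweep in range(1000):
    Vprev = V.copy()
    for s in range(S):
        Q = [R[s, a] + discount * P[a][s, :].dot(V) for a in range(A)]
        V[s] = max(Q)                     # in-place update: Gauss-Seidel
    if abs(V - Vprev).max() < 1e-8:
        break
print(V)                                  # converges towards [42.4419, 36.0465]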
@@ -1458,13 +1414,10 @@ class ValueIterationGS(MDP):
         if self.verbose:
             print('MDP Toolbox : iterations stopped by maximum number of iteration condition')
         
-        for s in range(S):
-            for a in range(A):
-                if iscell(P):
-                    Q[a] = PR[s, a] + P[a][s, :] * discount * V
-                else:
-                    Q[a] = PR[s, a] + P[s, :, a] * discount * V
-            V[s], policy[s, 1] = max(Q)
+        for s in range(self.S):
+            for a in range(self.A):
+                Q[a] = self.R[s, a] + self.P[a][s, :] * self.discount * self.value
+            self.value[s], self.policy[s, 1] = max(Q)
         
         self.time = time() - self.time
test_mdptoolbox.py
@@ -10,7 +10,7 @@ from numpy import array, eye, matrix, zeros
 from numpy.random import rand
 from scipy.sparse import eye as speye
 from scipy.sparse import csr_matrix as sparse
-from scipy.stats.distributions import poisson
+#from scipy.stats.distributions import poisson
 
 inst = MDP()
@@ -144,39 +144,50 @@ def test_ValueIteration_boundIter():
     inst = ValueIteration(P, R, 0.9, 0.01)
     assert (inst.max_iter == 28)

-def test_JacksCarRental():
-    S = 21 ** 2
-    A = 11
-    P = zeros((A, S, S))
-    R = zeros((A, S, S))
-    for a in range(A):
-        for s in range(21):
-            for s1 in range(21):
-                c1s = int(s / 21)
-                c2s = s - c1s * 21
-                c1s1 = int(s1 / 21)
-                c2s1 = s - c1s * 21
-                cs = c1s + c2s
-                cs1 = c1s1 + c2s1
-                netmove = 5 - a
-                if (s1 < s):
-                    pass
-                else:
-                    pass
-                P[a, s, s1] = 1
-                R[a, s, s1] = 10 * (cs - cs1) - 2 * abs(a)
-    
-    inst = PolicyIteration(P, R, 0.9)
-    inst.iterate()
-    #assert (inst.policy == )
-
-def test_JacksCarRental2():
-    pass
-
-def test_GamblersProblem():
-    inst = ValueIteration()
-    inst.iterate()
-    #assert (inst.policy == )
+# PolicyIteration
+def test_PolicyIteration():
+    P = array([[[0.5, 0.5],[0.8, 0.2]],[[0, 1],[0.1, 0.9]]])
+    R = array([[5, 10], [-1, 2]])
+    inst = PolicyIteration(P, R, 0.9)
+    inst.iterate()
+    assert (abs(inst.value[0] - 42.4419) < 0.001)
+    assert (abs(inst.value[1] - 36.0465) < 0.001)
+    assert (inst.policy == (1, 0))
+    assert (inst.iter == 2)
+
+#def test_JacksCarRental():
+#    S = 21 ** 2
+#    A = 11
+#    P = zeros((A, S, S))
+#    R = zeros((A, S, S))
+#    for a in range(A):
+#        for s in range(21):
+#            for s1 in range(21):
+#                c1s = int(s / 21)
+#                c2s = s - c1s * 21
+#                c1s1 = int(s1 / 21)
+#                c2s1 = s - c1s * 21
+#                cs = c1s + c2s
+#                cs1 = c1s1 + c2s1
+#                netmove = 5 - a
+#                if (s1 < s):
+#                    pass
+#                else:
+#                    pass
+#                P[a, s, s1] = 1
+#                R[a, s, s1] = 10 * (cs - cs1) - 2 * abs(a)
+#
+#    inst = PolicyIteration(P, R, 0.9)
+#    inst.iterate()
+#    #assert (inst.policy == )
+#
+#def test_JacksCarRental2():
+#    pass
+#
+#def test_GamblersProblem():
+#    inst = ValueIteration()
+#    inst.iterate()
+#    #assert (inst.policy == )
 
 # checkSquareStochastic: not square, stochastic and non-negative
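The values 42.4419 and 36.0465 asserted in test_PolicyIteration can be checked independently: evaluating the asserted policy (1, 0) amounts to solving (I - discount*P_policy)*V = R_policy. A small numpy sketch of that cross-check, not part of the test suite itself:

from numpy import array, eye
from numpy.linalg import solve

P = array([[[0.5, 0.5], [0.8, 0.2]],
           [[0.0, 1.0], [0.1, 0.9]]])
R = array([[5, 10], [-1, 2]])
discount = 0.9
policy = (1, 0)                 # action taken in state 0 and in state 1

P_policy = array([P[policy[s]][s] for s in range(2)])
R_policy = array([R[s, policy[s]] for s in range(2)])
V = solve(eye(2) - discount * P_policy, R_policy)
print(V)                        # approximately [42.4419, 36.0465]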