Zahra Rajabi / pymdptoolbox · Commits

Commit b7858639, parent 9a04a050
Authored Jan 07, 2015 by Steven Cordwell

    Make cosmetic changes to improve style of code

Changes: 3 files
src/mdptoolbox/example.py

@@ -164,11 +164,11 @@ def forest(S=3, r1=4, r2=2, p=0.1, is_sparse=False):
         rows = list(range(S)) * 2
         cols = [0] * S + list(range(1, S)) + [S - 1]
         vals = [p] * S + [1 - p] * S
         P.append(_sp.coo_matrix((vals, (rows, cols)), shape=(S, S)).tocsr())
         rows = list(range(S))
         cols = [0] * S
         vals = [1] * S
         P.append(_sp.coo_matrix((vals, (rows, cols)), shape=(S, S)).tocsr())
     else:
         P = _np.zeros((2, S, S))
         P[0, :, :] = (1 - p) * _np.diag(_np.ones(S - 1), 1)
@@ -182,7 +182,6 @@ def forest(S=3, r1=4, r2=2, p=0.1, is_sparse=False):
     R[:, 1] = _np.ones(S)
     R[0, 1] = 0
     R[S - 1, 1] = r2
-    # we want to return the generated transition and reward matrices
     return (P, R)


 def rand(S, A, is_sparse=False, mask=None):
@@ -338,12 +337,11 @@ def rand(S, A, is_sparse=False, mask=None):
             P[a][s] = P[a][s] / P[a][s].sum()
             R[a][s] = (m * (2 * _np.random.random(S) - _np.ones(S, dtype=int)))
-    # we want to return the generated transition and reward matrices
     return (P, R)


 def small():
     """A very small Markov decision process.

     The probability transition matrices are::

         | | 0.5 0.5 | |
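The rand() hunk normalises each randomly drawn row of P into a probability distribution and scales rewards into [-m, m]. The sketch below repeats those two steps in isolation (an illustration only; dense arrays, no mask handling, and m fixed to 1):

import numpy as np

S, A = 4, 2
P = np.random.random((A, S, S))
R = np.empty((A, S, S))

for a in range(A):
    for s in range(S):
        # Normalise the row so the transition probabilities sum to one.
        P[a][s] = P[a][s] / P[a][s].sum()
        # Rewards drawn uniformly from [-1, 1), mirroring the m-scaled
        # expression in the hunk above with m = 1.
        R[a][s] = 2 * np.random.random(S) - np.ones(S, dtype=int)

assert np.allclose(P.sum(axis=2), 1.0)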
@@ -356,7 +354,7 @@ def small():
         R = | 5  10 |
             | -1  2 |

     Returns
     =======
     out : tuple
@@ -378,6 +376,6 @@ def small():
            [-1,  2]])

     """
-    P = _np.array([[[0.5, 0.5],[0.8, 0.2]],[[0, 1],[0.1, 0.9]]])
+    P = _np.array([[[0.5, 0.5], [0.8, 0.2]], [[0, 1], [0.1, 0.9]]])
     R = _np.array([[5, 10], [-1, 2]])
     return (P, R)
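The reformatted small() still returns the same two-state, two-action arrays. A quick check (a sketch, not part of the commit) verifies that each transition matrix is stochastic and applies one Bellman backup from V = 0:

import numpy as np

P = np.array([[[0.5, 0.5], [0.8, 0.2]], [[0, 1], [0.1, 0.9]]])
R = np.array([[5, 10], [-1, 2]])

# Every P[a] row must sum to one.
assert np.allclose(P.sum(axis=2), 1.0)

# One Bellman backup: Q[a, s] = R[s, a] + discount * P[a, s, :] . V
discount, V = 0.9, np.zeros(2)
Q = np.array([[R[s, a] + discount * P[a, s].dot(V) for s in range(2)]
              for a in range(2)])
print(Q.max(axis=0), Q.argmax(axis=0))  # greedy value and action per state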
src/mdptoolbox/mdp.py

@@ -246,7 +246,7 @@ class MDP(object):
             if P.ndim == 3:
                 self.S = P.shape[1]
             else:
                 self.S = P[0].shape[0]
         except AttributeError:
             self.S = P[0].shape[0]
         # convert P to a tuple of numpy arrays
@@ -281,14 +281,14 @@ class MDP(object):
                 self.R = tuple(r for aa in range(self.A))
             elif R.ndim == 2:
                 self.R = tuple(_np.array(R[:, aa]).reshape(self.S)
                                for aa in range(self.A))
             else:
                 self.R = tuple(_np.multiply(P[aa], R[aa]).sum(1).reshape(self.S)
                                for aa in range(self.A))
         except AttributeError:
             if len(R) == self.A:
                 self.R = tuple(_np.multiply(P[aa], R[aa]).sum(1).reshape(self.S)
                                for aa in range(self.A))
             else:
                 r = _np.array(R).reshape(self.S)
                 self.R = tuple(r for aa in range(self.A))
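This hunk reduces every supported reward format to one length-S vector per action. When R is given per transition (A x S x S), the expected immediate reward is the probability-weighted row sum. A small sketch of that reduction, assuming dense numpy arrays:

import numpy as np

A, S = 2, 3
P = np.random.random((A, S, S))
P /= P.sum(axis=2, keepdims=True)   # make each P[a] row stochastic
R = np.random.random((A, S, S))     # reward per (action, state, next state)

# Expected immediate reward for taking action a in state s:
#   r_a(s) = sum over s' of P[a][s, s'] * R[a][s, s']
expected_R = tuple(np.multiply(P[aa], R[aa]).sum(1).reshape(S) for aa in range(A))

assert all(r.shape == (S,) for r in expected_R)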
@@ -375,8 +375,6 @@ class FiniteHorizon(MDP):
         # Set the reward for the final transition to h, if specified.
         if h is not None:
             self.V[:, N] = h
-        # Call the iteration method
-        #self.run()

     def run(self):
         # Run the finite horizon algorithm.
@@ -459,8 +457,6 @@ class LP(MDP):
         # this doesn't do what I want it to do c.f. issue #3
         if not self.verbose:
             solvers.options['show_progress'] = False
-        # Call the iteration method
-        #self.run()

     def run(self):
         #Run the linear programming algorithm.
@@ -488,7 +484,7 @@ class LP(MDP):
         # only to 10e-8 places. This assumes glpk is installed of course.
         self.V = _np.array(self._linprog(f, M, -h)['x']).reshape(self.S)
         # apply the Bellman operator
         self.policy, self.V = self._bellmanOperator()
         # update the time spent solving
         self.time = _time.time() - self.time
         # store value and policy as tuples
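After solving the linear program, LP.run() applies the Bellman operator once to read a greedy policy off the value vector. A minimal, self-contained version of that operator (a sketch; the class keeps P, R and discount as attributes, here they are plain arguments):

import numpy as np

def bellman_operator(P, R, discount, V):
    """Return (policy, V_new) for a value vector V.

    P: iterable of A stochastic (S, S) arrays; R: iterable of A length-S
    reward vectors; discount: scalar in (0, 1].
    """
    A, S = len(P), V.shape[0]
    Q = np.empty((A, S))
    for aa in range(A):
        Q[aa] = R[aa] + discount * P[aa].dot(V)
    return Q.argmax(axis=0), Q.max(axis=0)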
@@ -560,7 +556,7 @@ class PolicyIteration(MDP):
         # Set up the MDP, but don't need to worry about epsilon values
         MDP.__init__(self, transitions, reward, discount, None, max_iter)
         # Check if the user has supplied an initial policy. If not make one.
-        if policy0 == None:
+        if policy0 is None:
             # Initialise the policy to the one which maximises the expected
             # immediate reward
             null = _np.zeros(self.S)
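The change from `policy0 == None` to `policy0 is None` is more than style once numpy is involved: comparing an array to None with `==` broadcasts elementwise, and the resulting array cannot be used in a boolean context, whereas the identity test always yields a single bool. A short illustration, independent of the toolbox:

import numpy as np

policy0 = np.zeros(3, dtype=int)

print(policy0 is None)            # False -- a plain bool, safe to branch on

elementwise = (policy0 == None)   # array([False, False, False])
try:
    if elementwise:               # truth value of an array is ambiguous
        pass
except ValueError as err:
    print(err)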
@@ -592,8 +588,6 @@ class PolicyIteration(MDP):
             raise ValueError("'eval_type' should be '0' for matrix evaluation "
                              "or '1' for iterative evaluation. The strings "
                              "'matrix' and 'iterative' can also be used.")
-        # Call the iteration method
-        #self.run()

     def _computePpolicyPRpolicy(self):
         # Compute the transition matrix and the reward matrix for a policy.
@@ -768,7 +762,7 @@ class PolicyIteration(MDP):
                 done = True
                 if self.verbose:
                     print(_MSG_STOP_UNCHANGING_POLICY)
-            elif (self.iter == self.max_iter):
+            elif self.iter == self.max_iter:
                 done = True
                 if self.verbose:
                     print(_MSG_STOP_MAX_ITER)
@@ -857,9 +851,6 @@ class PolicyIterationModified(PolicyIteration):
             Rmin = min(R.min() for R in self.R)
             self.V = 1 / (1 - self.discount) * Rmin * _np.ones((self.S,))
-        # Call the iteration method
-        #self.run()

     def run(self):
         # Run the modified policy iteration algorithm.
@@ -991,9 +982,6 @@ class QLearning(MDP):
         self.Q = _np.zeros((self.S, self.A))
         self.mean_discrepancy = []
-        # Call the iteration method
-        #self.run()

     def run(self):
         # Run the Q-learning algoritm.
         discrepancy = []
@@ -1006,13 +994,13 @@ class QLearning(MDP):
         for n in range(1, self.max_iter + 1):

             # Reinitialisation of trajectories every 100 transitions
-            if ((n % 100) == 0):
+            if (n % 100) == 0:
                 s = _np.random.randint(0, self.S)

             # Action choice : greedy with increasing probability
             # probability 1-(1/log(n+2)) can be changed
             pn = _np.random.random()
-            if (pn < (1 - (1 / _math.log(n + 2)))):
+            if pn < (1 - (1 / _math.log(n + 2))):
                 # optimal_action = self.Q[s, :].max()
                 a = self.Q[s, :].argmax()
             else:
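In this hunk the exploration probability decays as 1 - 1/log(n + 2), so random actions become rarer as iterations accumulate. A standalone sketch of that action-choice rule (assuming a Q table is already available):

import math
import numpy as np

def choose_action(Q, s, n):
    """Greedy with probability 1 - 1/log(n + 2), uniform random otherwise."""
    if np.random.random() < 1 - 1 / math.log(n + 2):
        return int(Q[s, :].argmax())          # exploit
    return np.random.randint(0, Q.shape[1])   # explore

Q = np.zeros((10, 4))  # 10 states, 4 actions
print([choose_action(Q, s=0, n=n) for n in (1, 100, 10000)])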
@@ -1022,7 +1010,7 @@ class QLearning(MDP):
             p_s_new = _np.random.random()
             p = 0
             s_new = -1
-            while ((p < p_s_new) and (s_new < (self.S - 1))):
+            while (p < p_s_new) and (s_new < (self.S - 1)):
                 s_new = s_new + 1
                 p = p + self.P[a][s, s_new]
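The while loop above samples the next state from the row P[a][s, :] by walking the cumulative distribution until it passes a uniform draw. The same draw can be written with a cumulative sum and searchsorted (a sketch, equivalent only for a properly stochastic row):

import numpy as np

def sample_next_state(row):
    """Sample an index according to the probability vector `row`."""
    u = np.random.random_sample()
    return min(int(np.searchsorted(np.cumsum(row), u)), len(row) - 1)

row = np.array([0.2, 0.5, 0.3])
counts = np.bincount([sample_next_state(row) for _ in range(10000)], minlength=3)
print(counts / 10000.0)  # roughly [0.2, 0.5, 0.3]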
@@ -1139,9 +1127,6 @@ class RelativeValueIteration(MDP):
         self.average_reward = None
-        # Call the iteration method
-        #self.run()

     def run(self):
         # Run the relative value iteration algorithm.
@@ -1153,7 +1138,7 @@ class RelativeValueIteration(MDP):
         while not done:

-            self.iter += 1;
+            self.iter += 1

             self.policy, Vnext = self._bellmanOperator()
             Vnext = Vnext - self.gain
@@ -1164,15 +1149,15 @@ class RelativeValueIteration(MDP):
                 print((" %s\t\t %s" % (self.iter, variation)))

             if variation < self.epsilon:
                 done = True
                 self.average_reward = self.gain + (Vnext - self.V).min()
                 if self.verbose:
                     print(_MSG_STOP_EPSILON_OPTIMAL_POLICY)
             elif self.iter == self.max_iter:
                 done = True
                 self.average_reward = self.gain + (Vnext - self.V).min()
                 if self.verbose:
                     print(_MSG_STOP_MAX_ITER)

             self.V = Vnext
             self.gain = float(self.V[self.S - 1])
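This hunk mainly de-duplicates the two stopping branches: on either exit the average reward is estimated as gain + min(Vnext - V). A compact, self-contained sketch of the update and stopping rule (assumptions: dense arrays, and the variation measured as the span of Vnext - V, which is what getSpan in util.py computes):

import numpy as np

def relative_value_iteration(P, R, epsilon=0.01, max_iter=1000):
    """Average-reward MDP solver sketch; P is (A, S, S), R is (A, S)."""
    A, S, _ = P.shape
    V, gain = np.zeros(S), 0.0
    for it in range(1, max_iter + 1):
        Q = np.array([R[a] + P[a].dot(V) for a in range(A)])
        policy, Vnext = Q.argmax(axis=0), Q.max(axis=0)
        Vnext = Vnext - gain
        variation = (Vnext - V).max() - (Vnext - V).min()  # span, as in getSpan
        if variation < epsilon or it == max_iter:
            average_reward = gain + (Vnext - V).min()
            return policy, average_reward
        V, gain = Vnext, float(Vnext[S - 1])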
@@ -1320,9 +1305,6 @@ class ValueIteration(MDP):
         # threshold of variation for V for an epsilon-optimal policy
         self.thresh = epsilon
-        # Call the iteration method
-        #self.run()

     def _boundIter(self, epsilon):
         # Compute a bound for the number of iterations.
         #
@@ -1395,7 +1377,7 @@ class ValueIteration(MDP):
                 if self.verbose:
                     print(_MSG_STOP_EPSILON_OPTIMAL_POLICY)
                 break
-            elif (self.iter == self.max_iter):
+            elif self.iter == self.max_iter:
                 if self.verbose:
                     print(_MSG_STOP_MAX_ITER)
                 break
@@ -1491,9 +1473,6 @@ class ValueIterationGS(ValueIteration):
         # threshold of variation for V for an epsilon-optimal policy
         self.thresh = epsilon
-        # Call the iteration method
-        #self.run()

     def run(self):
         # Run the value iteration Gauss-Seidel algorithm.
@@ -1534,7 +1513,7 @@ class ValueIterationGS(ValueIteration):
             for s in range(self.S):
                 Q = _np.zeros(self.A)
                 for a in range(self.A):
-                    Q[a] = self.R[a][s] + self.discount * self.P[a][s,:].dot(self.V)
+                    Q[a] = self.R[a][s] + self.discount * self.P[a][s, :].dot(self.V)
                 self.V[s] = Q.max()
                 self.policy.append(int(Q.argmax()))
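The ValueIterationGS hunk only reflows the Q[a] assignment, but it is the core of the Gauss-Seidel variant: V[s] is overwritten during the sweep, so later states already see the updated values. A self-contained sketch of one sweep (dense arrays assumed):

import numpy as np

def gauss_seidel_sweep(P, R, discount, V):
    """One in-place Gauss-Seidel value-iteration sweep.

    P: (A, S, S) stochastic arrays, R: (A, S) rewards, V: length-S values
    (updated in place). Returns the greedy policy found during the sweep.
    """
    A, S = R.shape
    policy = []
    for s in range(S):
        Q = np.zeros(A)
        for a in range(A):
            Q[a] = R[a][s] + discount * P[a][s, :].dot(V)
        V[s] = Q.max()                  # immediately reused by later states
        policy.append(int(Q.argmax()))
    return policy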
src/mdptoolbox/util.py

@@ -19,12 +19,12 @@ getSpan
 # Copyright (c) 2011-2013 Steven A. W. Cordwell
 # Copyright (c) 2009 INRA
 #
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
 #
 #   * Redistributions of source code must retain the above copyright notice,
 #     this list of conditions and the following disclaimer.
 #   * Redistributions in binary form must reproduce the above copyright notice,
@@ -33,7 +33,7 @@ getSpan
 #   * Neither the name of the <ORGANIZATION> nor the names of its contributors
 #     may be used to endorse or promote products derived from this software
 #     without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@@ -49,7 +49,7 @@ getSpan
 import numpy as _np

 # These need to be fixed so that we use classes derived from Error.
-mdperr = {
+MDPERR = {
     "mat_nonneg":
         "Transition probabilities must be non-negative.",
     "mat_square":
@@ -84,9 +84,9 @@ mdperr = {
 def check(P, R):
     """Check if ``P`` and ``R`` define a valid Markov Decision Process (MDP).

     Let ``S`` = number of states, ``A`` = number of actions.

     Parameters
     ---------
     P : array
@@ -99,18 +99,18 @@ def check(P, R):
         shape of (S, A, A). It can also be a one dimensional array with a
         shape of (A, ), where each element contains matrix with a shape of
         (S, S) which can possibly be sparse. It can also be an array with
         a shape of (S, A) which can possibly be sparse.

     Notes
     -----
     Raises an error if ``P`` and ``R`` do not define a MDP.

     Examples
     --------
     >>> import mdptoolbox, mdptoolbox.example
     >>> P_valid, R_valid = mdptoolbox.example.rand(100, 5)
     >>> mdptoolbox.util.check(P_valid, R_valid) # Nothing should happen
     >>>
     >>> import numpy as np
     >>> P_invalid = np.random.rand(5, 100, 100)
     >>> mdptoolbox.util.check(P_invalid, R_valid) # Raises an exception
@@ -128,7 +128,7 @@ def check(P, R):
             # continue checking from there
             raise AttributeError
         else:
-            raise InvalidMDPError(mdperr["P_shape"])
+            raise InvalidMDPError(MDPERR["P_shape"])
     except AttributeError:
         try:
             aP = len(P)
@@ -136,9 +136,9 @@ def check(P, R):
             for aa in range(1, aP):
                 sP0aa, sP1aa = P[aa].shape
                 if (sP0aa != sP0) or (sP1aa != sP1):
-                    raise InvalidMDPError(mdperr["obj_square"])
+                    raise InvalidMDPError(MDPERR["obj_square"])
         except AttributeError:
-            raise InvalidMDPError(mdperr["P_shape"])
+            raise InvalidMDPError(MDPERR["P_shape"])
     # Checking R
     try:
         ndimR = R.ndim
@@ -151,7 +151,7 @@ def check(P, R):
         elif ndimR == 3:
             aR, sR0, sR1 = R.shape
         else:
-            raise InvalidMDPError(mdperr["R_shape"])
+            raise InvalidMDPError(MDPERR["R_shape"])
     except AttributeError:
         try:
             lenR = len(R)
@@ -160,15 +160,15 @@ def check(P, R):
                 sR0, sR1 = R[0].shape
                 for aa in range(1, aR):
                     sR0aa, sR1aa = R[aa].shape
-                    if ((sR0aa != sR0) or (sR1aa != sR1)):
-                        raise InvalidMDPError(mdperr["obj_square"])
+                    if (sR0aa != sR0) or (sR1aa != sR1):
+                        raise InvalidMDPError(MDPERR["obj_square"])
             elif lenR == sP0:
                 aR = aP
                 sR0 = sR1 = lenR
             else:
-                raise InvalidMDPError(mdperr["R_shape"])
+                raise InvalidMDPError(MDPERR["R_shape"])
         except AttributeError:
-            raise InvalidMDPError(mdperr["R_shape"])
+            raise InvalidMDPError(MDPERR["R_shape"])
     # Checking dimensions
     assert sP0 > 0, "The number of states in P must be greater than 0."
     assert aP > 0, "The number of actions in P must be greater than 0."
@@ -183,13 +183,12 @@ def check(P, R):
         checkSquareStochastic(P[aa])
     # We are at the end of the checks, so if no exceptions have been raised
     # then that means there are (hopefullly) no errors and we return None
     return None

 # These are the old code comments, which need to be converted to
 # information in the docstring:
 #
 # tranitions must be a numpy array either an AxSxS ndarray (with any
 # dtype other than "object"); or, a 1xA ndarray with a "object" dtype,
 # and each element containing an SxS array. An AxSxS array will be
 # be converted to an object array. A numpy object array is similar to a
 # MATLAB cell array.
@@ -208,7 +207,7 @@ def check(P, R):
 # As above but for the reward array. A difference is that the reward
 # array can have either two or 3 dimensions.
 #
 # We want to make sure that the transition probability array and the
 # reward array are in agreement. This means that both should show that
 # there are the same number of actions and the same number of states.
 # Furthermore the probability of transition matrices must be SxS in
@@ -238,7 +237,7 @@ def check(P, R):
 # telling the user what needs to be fixed.
 #
 # if we are using a normal array for this, then the first
 # dimension should be the number of actions, and the second and
 # third should be the number of states
 #
 # the first dimension of the transition matrix must report the same
@@ -253,14 +252,14 @@ def check(P, R):
 # normal arrays this is a matrix formed by taking a slice of the array
 #
 # if the rewarad array has an object dtype, then we check that
 # each element contains a matrix of the same shape as we did
 # above with the transition array.
 #
 # This indicates that the reward matrices are constructed per
 # transition, so that the first dimension is the actions and
 # the second two dimensions are the states.
 #
 # then the reward matrix is per state, so the first dimension is
 # the states and the second dimension is the actions.
 #
 # this is added just so that the next check doesn't error out
@@ -279,19 +278,19 @@ def rowsSumToOne(Z, n):
 def checkSquareStochastic(Z):
     """Check if Z is a square stochastic matrix.

     Let S = number of states.

     Parameters
     ----------
     Z : matrix
         This should be a two dimensional array with a shape of (S, S). It can
         possibly be sparse.

     Notes
     ----------
     Returns None if no error has been detected, else it raises an error.

     """
     # try to get the shape of the matrix
     try:
@@ -299,42 +298,40 @@ def checkSquareStochastic(Z):
     except AttributeError:
         raise TypeError("Matrix should be a numpy type.")
     except ValueError:
-        raise InvalidMDPError(mdperr["mat_square"])
+        raise InvalidMDPError(MDPERR["mat_square"])
     # check that the matrix is square, and that each row sums to one
-    assert s1 == s2, mdperr["mat_square"]
-    assert rowsSumToOne(Z, s2), mdperr["mat_stoch"]
+    assert s1 == s2, MDPERR["mat_square"]
+    assert rowsSumToOne(Z, s2), MDPERR["mat_stoch"]
     # make sure that there are no values less than zero
     try:
-        assert (Z >= 0).all(), mdperr["mat_nonneg"]
+        assert (Z >= 0).all(), MDPERR["mat_nonneg"]
     except (NotImplementedError, AttributeError, TypeError):
         try:
-            assert (Z.data >= 0).all(), mdperr["mat_nonneg"]
+            assert (Z.data >= 0).all(), MDPERR["mat_nonneg"]
         except AttributeError:
             raise TypeError("Matrix should be a numpy type.")
     return(None)


 def getSpan(W):
     """Return the span of W

     sp(W) = max W(s) - min W(s)

     """
     return (W.max() - W.min())


 class Error(Exception):
     """Base class for exceptions in this module."""

     def __init__(self):
         Exception.__init__(self)
         self.message = "PyMDPToolbox: "

     def __str__(self):
         return repr(self.message)


 class InvalidMDPError(Error):
     """Class for invalid definitions of a MDP."""

     def __init__(self, msg):
         Error.__init__(self)
         self.message += msg
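checkSquareStochastic has to treat dense arrays and scipy sparse matrices differently for the non-negativity test, because `(Z >= 0).all()` is not supported by every sparse type and the fallback inspects `Z.data`. A rough, self-contained equivalent of the check (an illustration, not the toolbox API):

import numpy as np
import scipy.sparse as sp

def is_square_stochastic(Z, tol=1e-10):
    """Return True if Z is square, row-stochastic and non-negative."""
    s1, s2 = Z.shape
    if s1 != s2:
        return False
    row_sums = np.asarray(Z.sum(axis=1)).flatten()
    if not np.allclose(row_sums, np.ones(s1), atol=tol):
        return False
    data = Z.data if sp.issparse(Z) else np.asarray(Z)
    return bool((data >= 0).all())

print(is_square_stochastic(np.array([[0.9, 0.1], [0.0, 1.0]])))   # True
print(is_square_stochastic(sp.identity(3, format="csr")))         # True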